diff --git a/.readthedocs.yml b/.readthedocs.yml index 744628c8e7..e1d8828eb2 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -5,6 +5,7 @@ build: tools: python: "3.10" apt_packages: + - graphviz - default-jre-headless jobs: pre_install: diff --git a/doc/htmldoc/benchmark_results.rst b/doc/htmldoc/benchmark_results.rst index 46954b0ead..bb7fa8b646 100644 --- a/doc/htmldoc/benchmark_results.rst +++ b/doc/htmldoc/benchmark_results.rst @@ -7,6 +7,7 @@ NEST performance benchmarks NEST performance is continuously monitored and improved across various network sizes. Here we show benchmarking results for NEST version 3.8 on Jureca-DC [1]_. The benchmarking framework and the structure of the graphs is described in [2]_. +For details on `State Propagation` (i.e., `Simulation Run`), see the guides :ref:`built_in_timers` and :ref:`run_simulations` Strong scaling experiment of the Microcircuit model [3]_ --------------------------------------------------------- @@ -45,11 +46,11 @@ Strong scaling experiment of the Multi-area-model [5]_ :columns: 10 Dynamical regime: Ground state - + .. image:: /static/img/mam_ground-state_benchmark.png Dynamical regime: Metastable state - + .. image:: /static/img/mam_metastable-state_benchmark.png @@ -96,6 +97,8 @@ Weak scaling experiment of the HPC benchmark model [6]_ .. seealso:: + * Guide to :ref:`Built-in timers ` and :ref:`run_simulations`. + Example networks: * :doc:`/auto_examples/Potjans_2014/index` diff --git a/doc/htmldoc/conf.py b/doc/htmldoc/conf.py index e4d309a098..ceb79ba915 100644 --- a/doc/htmldoc/conf.py +++ b/doc/htmldoc/conf.py @@ -22,6 +22,8 @@ import json import os + +# import shutil import subprocess import sys from pathlib import Path @@ -44,6 +46,7 @@ extensions = [ "sphinx_gallery.gen_gallery", "list_examples", + "sphinx.ext.graphviz", "sphinx.ext.autodoc", "sphinx.ext.napoleon", "sphinx.ext.autosummary", @@ -101,7 +104,8 @@ # |version| and |release|, also used in various other places throughout the # built documents. # - +graphviz_output_format = "svg" +graphviz_dot = "dot" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # diff --git a/doc/htmldoc/hpc/benchmarking.rst b/doc/htmldoc/hpc/benchmarking.rst index 870baf86a9..282184d01f 100644 --- a/doc/htmldoc/hpc/benchmarking.rst +++ b/doc/htmldoc/hpc/benchmarking.rst @@ -33,18 +33,20 @@ For more details on the conceptual ideas behind beNNch, refer to Albers et al. ( The left graph shows the absolute wall-clock time measured with Python-level timers for both network construction and state propagation. Error bars indicate variability across three simulation repeats with different random seeds. The top right graph displays the real-time factor defined as wall-clock time normalized by the model time. - Built-in timers resolve four different phases of the state propagation: update, collocation, communication, and delivery. + :ref:`Built-in timers ` resolve four different phases of the state propagation: update, collocation, communication, and delivery. Pink error bars show the same variability of state propagation as the left graph. The lower right graph shows the relative contribution of these phases to the state-propagation time. .. seealso:: - For further details, see the accompanying `beNNch GitHub Page `_. - And for a detailed step-by-step walk though see `Walk through guide `_. + * For further details, see the accompanying `beNNch GitHub Page `_. - Example PyNEST script: :doc:`../auto_examples/hpc_benchmark` + * For a detailed step-by-step walk though see `Walk through guide `_. + * Example PyNEST script: :doc:`../auto_examples/hpc_benchmark` + + * Guide to :ref:`built_in_timers` and :ref:`running simulations `. References ---------- diff --git a/doc/htmldoc/nest_behavior/built-in_timers.rst b/doc/htmldoc/nest_behavior/built-in_timers.rst index e9f6a914f7..aad17d010e 100644 --- a/doc/htmldoc/nest_behavior/built-in_timers.rst +++ b/doc/htmldoc/nest_behavior/built-in_timers.rst @@ -59,74 +59,115 @@ In the context of NEST performance monitoring, other useful kernel attributes ar Detailed timers --------------- -Detailed built-in timers can be activated (and again deactivated) prior to compilation through the CMake flag -``-Dwith-detailed-timers=ON``. They provide further insights into the time NEST spends in different phases of the -simulation cycle, but they can impact the runtime. Therefore, detailed timers are by default inactive. +Detailed built-in timers can be activated (and again deactivated) prior to compilation through the CMake flag: -If detailed timers are active, the following time measurements are available as kernel attributes: +``-Dwith-detailed-timers=ON``. -+-------------------------------------------+-----------------------------------+----------------------------------+ -| Name | Explanation | Part of | -+===========================================+===================================+==================================+ -| ``time_gather_target_data`` | Cumulative time for communicating | ``time_communicate_prepare`` | -| | connection information from | | -| | postsynaptic to presynaptic side | | -+-------------------------------------------+-----------------------------------+----------------------------------+ -| ``time_communicate_target_data`` | Cumulative time for core MPI | ``time_gather_target_data`` | -| | communication when gathering | | -| | target data | | -+-------------------------------------------+-----------------------------------+----------------------------------+ -| ``time_update`` | Time for neuron update | ``time_simulate`` | -+-------------------------------------------+-----------------------------------+----------------------------------+ -| ``time_gather_spike_data`` | Time for complete spike exchange | ``time_simulate`` | -| | after update phase | | -+-------------------------------------------+-----------------------------------+----------------------------------+ -| ``time_collocate_spike_data`` | Time to collocate MPI send buffer | ``time_gather_spike_data`` | -| | from spike register | | -+-------------------------------------------+-----------------------------------+----------------------------------+ -| ``time_communicate_spike_data`` | Time for communicating spikes | ``time_gather_spike_data`` | -| | between compute nodes | | -+-------------------------------------------+-----------------------------------+----------------------------------+ -| ``time_deliver_spike_data`` | Time to deliver events from the | ``time_gather_spike_data`` | -| | MPI receive buffers to their | | -| | local synaptic targets (including | | -| | synaptic update, e.g. STDP | | -| | synapses) and to the spike ring | | -| | buffers of the corresponding | | -| | postsynaptic neurons | | -+-------------------------------------------+-----------------------------------+----------------------------------+ -| ``time_omp_synchronization_construction`` | Synchronization time of threads | ``time_construction_create``, | -| | during network construction. | ``time_construction_connect``, | -| | | ``time_communicate_prepare`` | -+-------------------------------------------+-----------------------------------+----------------------------------+ -| ``time_omp_synchronization_simulation`` | Synchronization time of threads | ``time_simulate`` | -| | during simulation. | | -+-------------------------------------------+-----------------------------------+----------------------------------+ - -MPI synchronization timer -------------------------- -In order to measure synchronization time between multiple MPI processes, an additional timer can be activated on demand -via the ``-Dwith-mpi-sync-timer=ON`` CMake flag. This timer measures the time between the end of a process' update phase -(i.e., neuron state propagation) and start of collective communication of spikes between all MPI processes. This timer -adds an additional MPI barrier right before the start of communication, which might affect performance. - -+------------------------------+-----------------------------------------+ -| Name | Explanation | -+==============================+=========================================+ -| ``time_mpi_synchronization`` | Time spent waiting for other processes. | -+------------------------------+-----------------------------------------+ - -Multi-threaded timers ---------------------- -In previous NEST versions, only the master thread measured timers. Since NEST 3.9, timers which measure time spent exclusively in multi-threaded environments are recorded by each thread individually. - -The legacy timer behavior can be restored via the ``-Dwith-threaded-timers=OFF`` CMake flag. - -Wall-time vs. CPU-time -------------------------- -All timers in NEST measure the actual wall-time spent between starting and stopping the timer. In order to only measure -time spent on calculations, there is an additional variant for each of the timers above, suffixed with ``_cpu``. They -can be accessed in the exact same way. For example: -:: +They provide further insights into the time NEST spends in different phases of the +simulation cycle so they can be useful for :ref:`benchmarking performance `, but they can impact the runtime. +Therefore, detailed timers are by default inactive. + +.. grid:: + :gutter: 2 + + .. grid-item-card:: **Simulation run (state propagation) diagram** + :columns: 5 + :class-item: sd-text-center + + .. graphviz:: /simulation_run.dot + :name: Simulation run (State propagation) + :caption: Simplified sequence of operations in the simulation run, organized in a top-down manner with a focus on timers. + + Within the `simulate timer` section, parallel processes + (`OMP Parallel`) manage time-driven loops, handling tasks such as delivering spike data, and updating timers. + The `OMP Master` section is responsible for gathering and communicating + spike data, involving steps like collocating data and advancing the simulation time. `OMP + barriers` are used to ensure thread synchronization at key points (for more details please see `Jordan et al. 2018 + `_). + The timers are indicated in white or light grey. + + For source code see: `SimulationManager::update_ `_ + and `EventDeliveryManager::gather_spike_data `_ + + + + .. grid-item:: + :columns: 7 + + **Multi-threaded timers** + + In previous NEST versions, only the master thread measured timers (OMP_Master). + Since NEST 3.9, timers which measure time spent exclusively in multi-threaded environments (OMP_Parallel) + are recorded by each thread individually. + + The legacy timer behavior can be restored via the CMake flag: + + ``-Dwith-threaded-timers=OFF`` + + **Wall-time vs. CPU-time** + + All timers in NEST measure the actual wall-time spent between starting and stopping the timer. In order to only measure + time spent on calculations, there is an additional variant for each of the timers above, suffixed with ``_cpu``. They + can be accessed in the exact same way. For example: + :: + + nest.time_simulate_cpu + + **MPI synchronization timer** + + In order to measure synchronization time between multiple MPI processes, an additional timer can be activated on demand + via the CMake flag + + ``-Dwith-mpi-sync-timer=ON``. + + This timer measures the time between the end of a process' update phase + (i.e., neuron state propagation) and start of collective communication of spikes between all MPI processes. This timer + adds an additional MPI barrier right before the start of communication, which might affect performance. + + + .. seealso:: + + - For more information see the :ref:`run_simulations` guide + +Kernel attribrutes for detailed timers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If detailed timers are active, the following time measurements are available as kernel attributes: - nest.time_simulate_cpu +.. list-table:: + :widths: 30 40 30 + :header-rows: 1 + + * - Name + - Explanation + - Part of + * - ``time_gather_target_data`` + - Cumulative time for communicating connection information from postsynaptic to presynaptic side + - ``time_communicate_prepare`` + * - ``time_communicate_target_data`` + - Cumulative time for core MPI communication when gathering target data + - ``time_gather_target_data`` + * - ``time_update`` + - Time for neuron update + - ``time_simulate`` + * - ``time_gather_spike_data`` + - Time for complete spike exchange after update phase + - ``time_simulate`` + * - ``time_collocate_spike_data`` + - Time to collocate MPI send buffer from spike register + - ``time_gather_spike_data`` + * - ``time_communicate_spike_data`` + - Time for communicating spikes between compute nodes + - ``time_gather_spike_data`` + * - ``time_deliver_spike_data`` + - Time to deliver events from the MPI receive buffers to their local synaptic targets (including synaptic update, e.g. STDP synapses) and to the spike ring buffers of the corresponding postsynaptic neurons + - ``time_gather_spike_data`` + * - ``time_mpi_synchronization`` + - Time spent waiting for other processes + - ``time_communicate_spike_data`` + * - ``time_omp_synchronization_construction`` + - Synchronization time of threads during network construction. + - ``time_construction_create``, ``time_construction_connect``, ``time_communicate_prepare`` + * - ``time_omp_synchronization_simulation`` + - Synchronization time of threads during simulation. + - ``time_simulate`` diff --git a/doc/htmldoc/nest_behavior/running_simulations.rst b/doc/htmldoc/nest_behavior/running_simulations.rst index 8d4ec785bd..9454dae125 100644 --- a/doc/htmldoc/nest_behavior/running_simulations.rst +++ b/doc/htmldoc/nest_behavior/running_simulations.rst @@ -327,3 +327,4 @@ threads or processes): ``{"print_time": False}`` to avoid the overhead of the print calls. In these cases, the real-time factor can be computed by measuring the wall-clock time manually and dividing by the set model time. + For details on timers in NEST see :ref:`built_in_timers`. diff --git a/doc/htmldoc/simulation_run.dot b/doc/htmldoc/simulation_run.dot new file mode 100644 index 0000000000..8ee7176a31 --- /dev/null +++ b/doc/htmldoc/simulation_run.dot @@ -0,0 +1,92 @@ + digraph G { + node [fontname="Helvetica,Arial,sans-serif", shape="box", fontcolor="white", style="filled", fillcolor="#0E6A93"] + rankdir="LR" + bgcolor="#ff6633" + label="Simulation Run" + labelloc="t" + fontcolor="white" + fontname="Helvetica,Arial,sans-serif" + + subgraph cluster_simulate_timer { + label="time_simulate" + style=filled fontcolor="black" fontname="Courier New, Courier, monospace" + fillcolor="white" + subgraph cluster_OMP_Parallel { + label="OMP Parallel" + style=filled + fillcolor="#0E6A93" fontcolor="white" fontname="Helvetica,Arial,sans-serif" + subgraph cluster_time_driven_loop { + label="Time driven loop" + style=filled + fillcolor="#072f42" + subgraph cluster_deliver_spike_data_timer { + label="time_deliver_spike_data" + style=filled fontcolor="black" fontname="Courier New, Courier, monospace" + fillcolor="white" + subgraph cluster_deliver_events { + label="Deliver events" + style=filled fontcolor="white" fontname="Helvetica,Arial,sans-serif" + fillcolor="#072f42" + "plasticity update" + } + } + subgraph cluster_update_timer { + label="time_update" + style=filled fontcolor="black" fontname="Courier New, Courier, monospace" + fillcolor="white" + "node update" + } + subgraph cluster_omp_synchronization_timer { + label="time_omp_synchronization_simulation" + style=filled fontcolor="black" fontname="Courier New, Courier, monospace" + fillcolor="white" + "OMP barrier" + } + subgraph cluster_omp_master { + label="OMP Master" + style=filled fontcolor="white" + fillcolor="#0E6A93" + "advance time" + + subgraph cluster_gather_spike_data_timer{ + label= "time_gather_spike_data" + style=filled fontcolor="black" fontname="Courier New, Courier, monospace" + fillcolor="white" + subgraph cluster_gather_spike_Data{ + label="Gather spike data" + style=filled fontcolor="white" fontname="Helvetica, Arial, sans-serif" + fillcolor="#072f42" + subgraph cluster_repeat_once { + label="Repeat once if spike buffer too small" + style=filled fontcolor="white" + fillcolor="#0E6A93" + + + subgraph cluster_collocate_spike_data_timer { + label="time_collocate_spike_data" + style=filled fontcolor="black" fontname="Courier New, Courier, monospace" + fillcolor="white" + "collocate spike data" + } + + subgraph cluster_communicate_spike_data_timer { + label= "time_communicate_spike_data" + style=filled fontcolor="black" fontname="Courier New, Courier, monospace" + fillcolor="white" + "communicate spike data" + subgraph cluster_mpi_sync_timer { + label= "time_mpi_synchronization" + style=filled fontcolor="black" fontname="Courier New, Courier, monospace" + fillcolor="#f1f1f1" + "MPI barrier" + + } + } + } + } + } + } + } + } + } + } diff --git a/doc/requirements.txt b/doc/requirements.txt index e4eb2c4dc8..c9a00c79c1 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -8,6 +8,7 @@ docutils>=0.19 example>=0.1.0 flask>=2.3.2 flask_cors>=3.0.10 +graphviz ipykernel>=6.22.0 ipython>=8.11.0 jupyter>=1.0.0