[looptree] New latency model and updated returned statistics

Michael Gilbert · Michael Gilbert · commit 56224de7b1ef · 2025-03-28T19:12:21.000-04:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.4)
+cmake_minimum_required(VERSION 3.5)
 project(timeloop_python)
 
 if(DEFINED ENV{TIMELOOP_INCLUDE_PATH})
diff --git a/pytimeloop/isl/reduction.py b/pytimeloop/isl/reduction.py
@@ -6,3 +6,16 @@ def make_reduction_map(space, dims_out_first, n_dims_out):
     """
     return isl.Map.identity(space.map_from_set())\
                 .project_out(isl.dim_type.in_, dims_out_first, n_dims_out)
+
+
+def make_reduction_map_from_mask(space, mask):
+    """
+    Creates a reduction map whose domain omits the dimensions marked True
+    in `mask`. E.g., if mask is [False, True, False] and space is
+    { [x, y, z] }, the result is { [x, z] -> [x, y, z] }.
+    """
+    isl_map = isl.Map.identity(space.map_from_set())
+    for i, is_reduced in reversed(list(enumerate(mask))):
+        if is_reduced:
+            isl_map = isl_map.project_out(isl.dim_type.in_, i, 1)
+    return isl_map
diff --git a/pytimeloop/isl/sum.py b/pytimeloop/isl/sum.py
@@ -1,6 +1,7 @@
+from collections.abc import Sequence
 import islpy as isl
 
-from .reduction import make_reduction_map
+from .reduction import make_reduction_map, make_reduction_map_from_mask
 
 
 def sum_until_idx(n_dims_left: int, pw_qp):
@@ -11,3 +12,9 @@ def sum_until_idx(n_dims_left: int, pw_qp):
                                        dims_out_first,
                                        n_dims_out)
     return reduction_map.apply_pw_qpolynomial(pw_qp)
+
+
+def sum_with_mask(mask: Sequence[bool], pw_qp):
+    reduction_map = make_reduction_map_from_mask(pw_qp.get_domain_space(),
+                                                 mask)
+    return reduction_map.apply_pw_qpolynomial(pw_qp)
diff --git a/pytimeloop/looptree/accesses.py b/pytimeloop/looptree/accesses.py
@@ -2,9 +2,12 @@
 from collections.abc import Mapping
 from numbers import Number
 
+from bindings.looptree import TemporalTag, SequentialTag, PipelineTemporalTag
+
 import islpy as isl
 
 from pytimeloop.isl.singular import get_sum_of_pw_qpolynomial
+from pytimeloop.isl.sum import sum_with_mask
 from pytimeloop.looptree.mapping_utilities import *
 
 
@@ -29,7 +32,8 @@ def reads_and_writes_from_fill_by_parent(fills: Mapping,
                                          reads_to_parent,
                                          mapping,
                                          workload,
-                                         is_path=False):
+                                         is_path=False,
+                                         per_unit=False):
     mapping = mapping['nodes']
     dspace_id_to_name = workload.data_space_id_to_name()
     einsum_id_to_name = workload.einsum_id_to_name()
@@ -49,8 +53,33 @@ def reads_and_writes_from_fill_by_parent(fills: Mapping,
     for (buffer_id, dspace_id, einsum_id), (tags, fill) in fills.items():
         read_to_parent = reads_to_parent[(buffer_id, dspace_id, einsum_id)][1]
 
-        read_to_parent = get_sum_of_pw_qpolynomial(read_to_parent)
-        fill = get_sum_of_pw_qpolynomial(fill)
+        if not per_unit:
+            read_to_parent = get_sum_of_pw_qpolynomial(read_to_parent)
+            fill = get_sum_of_pw_qpolynomial(fill)
+        else:
+            fill = sum_with_mask(
+                [
+                    (
+                        isinstance(t, TemporalTag) or
+                        isinstance(t, PipelineTemporalTag) or
+                        isinstance(t, SequentialTag)
+                    )
+                    for t in tags
+                ],
+                fill
+            ).max().to_python()
+            n_read_to_parent_dim = read_to_parent.dim(isl.dim_type.in_)
+            read_to_parent = sum_with_mask(
+                [
+                    (
+                        isinstance(t, TemporalTag) or
+                        isinstance(t, PipelineTemporalTag) or
+                        isinstance(t, SequentialTag)
+                    )
+                    for t in tags[:n_read_to_parent_dim]
+                ],
+                read_to_parent
+            ).max().to_python()
 
         dspace_name = dspace_id_to_name[dspace_id]
         einsum_name = einsum_id_to_name[einsum_id]
@@ -61,24 +90,32 @@ def reads_and_writes_from_fill_by_parent(fills: Mapping,
             key = (parent_buffer, dspace_name, einsum_name)
             if dspace_id in workload.tensors_written_by_einsum(einsum_id):
                 writes[key] += read_to_parent
+                reads[key] += read_to_parent
                 # Subtracted term: elided first read of a read-write tensor
-                reads[key] += \
-                    read_to_parent - workload.get_tensor_volume(dspace_id)
+                # TODO: figure out how to do this per unit
+                if not per_unit:
+                    reads[key] -= workload.get_tensor_volume(dspace_id)
             elif dspace_id in workload.tensors_read_by_einsum(einsum_id):
                 reads[key] += read_to_parent
         # Fills will write into current buffer except for compute (which does
         # not have write action) and top-level buffer
         if buffer_id not in compute_targets and parent_buffer is not None:
             if dspace_id in workload.tensors_written_by_einsum(einsum_id):
-                writes[(buffer_id, dspace_name, einsum_name)] += \
-                    fill - workload.get_tensor_volume(dspace_id)
+                writes[(buffer_id, dspace_name, einsum_name)] += fill
+                if not per_unit:
+                    writes[(buffer_id, dspace_name, einsum_name)] -= \
+                        workload.get_tensor_volume(dspace_id)
             else:
                 writes[(buffer_id, dspace_name, einsum_name)] += fill
 
     return reads, writes
 
 
-def reads_and_writes_from_fill_by_peer(fills: Mapping, mapping, workload, is_path=False):
+def reads_and_writes_from_fill_by_peer(fills: Mapping,
+                                       mapping,
+                                       workload,
+                                       is_path=False,
+                                       per_unit=False):
     mapping = mapping['nodes']
     dspace_id_to_name = workload.data_space_id_to_name()
     einsum_id_to_name = workload.einsum_id_to_name()
@@ -89,14 +126,27 @@ def reads_and_writes_from_fill_by_peer(fills: Mapping, mapping, workload, is_pat
     einsums_with_complete_mappings = get_einsums_with_complete_mappings(mapping, workload, is_path)
 
     for (buffer_id, dspace_id, einsum_id), (tags, fill) in fills.items():
-        fill = get_sum_of_pw_qpolynomial(fill)
+        if not per_unit:
+            fill = get_sum_of_pw_qpolynomial(fill)
+        else:
+            fill = sum_with_mask(
+                [
+                    (
+                        isinstance(t, TemporalTag) or
+                        isinstance(t, PipelineTemporalTag) or
+                        isinstance(t, SequentialTag)
+                    )
+                    for t in tags
+                ],
+                fill
+            ).max().to_python()
         einsum_name = einsum_id_to_name[einsum_id]
         dspace_name = dspace_id_to_name[dspace_id]
         if einsum_id not in einsums_with_complete_mappings:
             continue
 
         reads[(buffer_id, dspace_name, einsum_name)] = fill
-        writes[(buffer_id, dspace_name, einsum_name)] = 0 # already accounted for in above
+        writes[(buffer_id, dspace_name, einsum_name)] = 0 # already accounted for in fill_by_parent
 
     return reads, writes
 
diff --git a/pytimeloop/looptree/energy.py b/pytimeloop/looptree/energy.py
@@ -12,7 +12,8 @@ def gather_actions(looptree_results, mapping, workload, bindings, is_path=False)
 
     einsum_name_to_id = workload.einsum_name_to_id()
 
-    einsums_with_complete_mapping = get_einsums_with_complete_mappings(mapping['nodes'], workload, is_path)
+    einsums_with_complete_mapping = \
+        get_einsums_with_complete_mappings(mapping['nodes'], workload, is_path)
     einsums_with_complete_mapping = {
         e if isinstance(e, int) else einsum_name_to_id[e]
         for e in einsums_with_complete_mapping
@@ -94,7 +95,7 @@ def gather_ops(ops, einsums_with_complete_mapping):
         if einsum_id not in einsums_with_complete_mapping:
             continue
         if isinstance(v, isl.PwQPolynomial):
-            total += get_sum_of_pw_qpolynomial(v).to_python()
+            total += get_sum_of_pw_qpolynomial(v)
         elif isinstance(v, Number):
             total += v
         else:
diff --git a/pytimeloop/looptree/latency/__init__.py b/pytimeloop/looptree/latency/__init__.py
@@ -1 +1 @@
-from .latency import compute_latency
+from .latency import get_latency
diff --git a/pytimeloop/looptree/latency/latency.py b/pytimeloop/looptree/latency/latency.py
@@ -1,35 +1,84 @@
 from collections import defaultdict
 
 from pytimeloop.isl.singular import get_value_from_singular_qpolynomial
+from pytimeloop.looptree.accesses import (
+    reads_and_writes_from_fill_by_parent,
+    reads_and_writes_from_fill_by_peer
+)
 from pytimeloop.looptree.latency.processors import LATENCY_PROCESSORS
+from pytimeloop.looptree.des import LooptreeOutput
 
 from bindings.looptree import SpatialTag
 
 
-def get_latency(actions, mapping, temporal_steps, workload, arch):
-    comp_latency = compute_latency(mapping, temporal_steps, workload)
-    mem_latency = memory_latency(actions, arch)
-    return max(comp_latency, max(mem_latency.values()))
+def get_latency(looptree_results: LooptreeOutput,
+                mapping,
+                workload,
+                arch,
+                bindings):
+    comp_latency = compute_latency(mapping,
+                                   looptree_results.temporal_steps,
+                                   workload)
+    mem_latency = memory_latency(looptree_results,
+                                 arch,
+                                 mapping,
+                                 workload,
+                                 bindings)
+    overall_latency = max(comp_latency, max(mem_latency.values()))
+    return overall_latency, comp_latency, mem_latency
 
 
 def compute_latency(mapping, temporal_steps, workload):
     return get_value_from_singular_qpolynomial(
-        _compute_latency(mapping, 0, temporal_steps, workload)[1]
+        _compute_latency(mapping.nodes, 0, temporal_steps, workload)[1]
     ).to_python()
 
 
-def memory_latency(actions, arch):
+def memory_latency(looptree_results: LooptreeOutput,
+                   arch,
+                   mapping,
+                   workload,
+                   bindings):
+    reads, writes = reads_and_writes_from_fill_by_parent(
+        looptree_results.fills,
+        looptree_results.reads_to_parent,
+        mapping,
+        workload,
+        per_unit=True
+    )
+
+    peer_reads, peer_writes = reads_and_writes_from_fill_by_peer(
+        looptree_results.reads_to_peer,
+        mapping,
+        workload,
+        per_unit=True
+    )
+
     component_to_read_writes = defaultdict(lambda: [None, None])
-    for (component, action), count in actions.items():
-        if action == 'read':
-            component_to_read_writes[component][0] = count
-        elif action == 'write':
-            component_to_read_writes[component][1] = count
+    for level, component in bindings.items():
+        read_count = sum(reads[key] for key in reads if key[0] == level)
+        read_count += sum(peer_reads[key]
+                          for key in peer_reads if key[0] == level)
+        write_count = sum(writes[key] for key in writes if key[0] == level)
+        write_count += sum(peer_writes[key]
+                           for key in peer_writes if key[0] == level)
+        if component not in component_to_read_writes:
+            component_to_read_writes[component][0] = read_count
+            component_to_read_writes[component][1] = write_count
+        else:
+            component_to_read_writes[component][0] += read_count
+            component_to_read_writes[component][1] += write_count
 
     component_latency = {}
     bandwidths = get_bandwidth(arch)
     for component, (reads, writes) in component_to_read_writes.items():
         read_bw, write_bw, shared_bw = bandwidths[component]
+
+        # Small epsilon guards the divisions below against a zero bandwidth
+        read_bw += 1e-8
+        write_bw += 1e-8
+        shared_bw += 1e-8
+
         # All shared bw for writing
         write_latency = writes / (write_bw + shared_bw)
         read_latency = reads / read_bw
@@ -58,6 +107,8 @@ def get_bandwidth(arch):
         n_rd_ports = attributes.get('n_rd_ports', 0)
         n_wr_ports = attributes.get('n_wr_ports', 0)
         n_rdwr_ports = attributes.get('n_rdwr_ports', 0)
+        if n_rd_ports + n_wr_ports + n_rdwr_ports < 1:
+            n_rdwr_ports = 1
 
         width = attributes['width']
         datawidth = attributes['datawidth']
diff --git a/pytimeloop/looptree/run.py b/pytimeloop/looptree/run.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from pathlib import Path
 
 import islpy as isl
@@ -7,14 +8,24 @@
 
 from pytimeloop.file import gather_yaml_configs
 
+from pytimeloop.looptree.capacity import compute_capacity_usage
 from pytimeloop.looptree.des import deserialize_looptree_output
 from pytimeloop.looptree.energy import gather_actions, compute_energy_from_actions
-from pytimeloop.looptree.latency import compute_latency
+from pytimeloop.looptree.latency import get_latency
 
 from pytimeloop.timeloopfe.v4fused import Specification
 from pytimeloop.timeloopfe.common.backend_calls import call_accelergy_verbose
 
 
+@dataclass
+class LoopTreeStatistics:
+    latency: float
+    energy: float
+    actions: dict
+    memory_latency: dict
+    capacity_usage: dict
+
+
 def run_looptree(config_dir, paths, tmp_path, bindings, call_accelergy):
     yaml_str = gather_yaml_configs(config_dir, paths)
     config = Config(yaml_str, 'yaml')
@@ -39,8 +50,22 @@ def run_looptree(config_dir, paths, tmp_path, bindings, call_accelergy):
     actions = gather_actions(result, spec.mapping, workload, bindings)
     energy = compute_energy_from_actions(actions, spec.ERT)
 
-    latency = compute_latency(spec.mapping.nodes,
-                              result.temporal_steps,
-                              workload)
+    latency, comp_latency, mem_latency = get_latency(result,
+                                                     spec.mapping,
+                                                     workload,
+                                                     spec.architecture,
+                                                     bindings)
+
+    capacity_usage = compute_capacity_usage(spec.mapping.nodes,
+                                            result.occupancy,
+                                            workload)
+    component_capacity_usage = {}
+    for level, component in bindings.items():
+        if level in capacity_usage:
+            component_capacity_usage[component] = capacity_usage[level]
 
-    return latency, energy
+    return LoopTreeStatistics(latency,
+                              energy,
+                              actions,
+                              mem_latency,
+                              capacity_usage=component_capacity_usage)
diff --git a/tests/looptree/test_latency.py b/tests/looptree/test_latency.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+import unittest
+
+from pytimeloop.looptree.run import run_looptree
+
+from tests.util import TEST_TMP_DIR
+
+
+class TestLatency(unittest.TestCase):
+    def test_fused_sequential(self):
+        BINDINGS = {
+            0: 'MainMemory',
+            1: 'GlobalBuffer',
+            2: 'GlobalBuffer',
+            3: 'GlobalBuffer',
+            4: 'MACC'
+        }
+
+        stats = run_looptree(
+            Path(__file__).parent.parent / 'test_configs',
+            [
+                'looptree-test-fused.yaml',
+                'cascaded_mm.workload.yaml',
+                'three_level.arch.yaml'
+            ],
+            TEST_TMP_DIR,
+            BINDINGS,
+            True
+        )
+
+        self.assertEqual(54, stats.latency)
diff --git a/tests/looptree/test_run.py b/tests/looptree/test_run.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-cmake_minimum_required(VERSION 3.4)`
	`1`	`+cmake_minimum_required(VERSION 3.5)`
`2`	`2`	`project(timeloop_python)`
`3`	`3`
`4`	`4`	`if(DEFINED ENV{TIMELOOP_INCLUDE_PATH})`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-from .latency import compute_latency`
	`1`	`+from .latency import get_latency`