Skip to content

Commit aa1c97c

Browse files
author
Michael Gilbert
committed
[looptree] Symbolic model can produce latency and energy
1 parent e64fbb9 commit aa1c97c

File tree

10 files changed

+142
-39
lines changed

10 files changed

+142
-39
lines changed

pytimeloop/looptree/accesses.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
from collections import defaultdict
2-
from collections.abc import Mapping
31
from dataclasses import dataclass
4-
from numbers import Number
2+
from functools import reduce
3+
from operator import mul
54
from typing import Optional, overload
65

76
from bindings.looptree import TemporalTag, SequentialTag, PipelineTemporalTag
@@ -61,10 +60,12 @@ def summarize_total_and_per_unit_actions(
6160
reuse_analysis_result
6261
):
6362
result = {}
63+
reads_to_parent = reuse_analysis_result.reads_to_parent
64+
reads_to_peer = reuse_analysis_result.reads_to_peer
6465
if isinstance(reuse_analysis_result, IslReuseAnalysisOutput):
6566
for key, (tags, fill) in reuse_analysis_result.fills.items():
66-
read_to_parent = reuse_analysis_result.reads_to_parent[key][1]
67-
read_to_peer = reuse_analysis_result.reads_to_peer[key][1]
67+
read_to_parent = reads_to_parent[key][1]
68+
read_to_peer = reads_to_peer[key][1]
6869

6970
total_fill = get_sum_of_pw_qpolynomial(fill)
7071
total_read_to_parent = get_sum_of_pw_qpolynomial(read_to_parent)
@@ -89,20 +90,28 @@ def summarize_total_and_per_unit_actions(
8990
max_per_unit_read_to_peer)
9091
elif isinstance(reuse_analysis_result, SummarizedAnalysisOutput):
9192
for key, (tags, fill) in reuse_analysis_result.fills.items():
92-
buffer_id = key[0]
93+
buffer_name = key[0]
9394

94-
read_to_parent = reuse_analysis_result.reads_to_parent[key][1]
95-
read_to_peer = reuse_analysis_result.reads_to_peer[key][1]
95+
if key in reads_to_parent:
96+
read_to_parent = reads_to_parent[key][1]
97+
else:
98+
read_to_parent = 0
99+
100+
if key in reads_to_peer:
101+
read_to_peer = reads_to_peer[key][1]
102+
else:
103+
read_to_peer = 0
96104

97105
total_fill = fill
98106
total_read_to_parent = read_to_parent
99107
total_read_to_peer = read_to_peer
100108

101-
fanout = reuse_analysis_result.fanout[buffer_id]
109+
fanout = reuse_analysis_result.fanout[buffer_name]
110+
total_fanout = reduce(mul, fanout, 1)
102111

103-
max_per_unit_fill = fill / fanout
104-
max_per_unit_read_to_parent = read_to_parent / fanout
105-
max_per_unit_read_to_peer = read_to_peer / fanout
112+
max_per_unit_fill = fill / total_fanout
113+
max_per_unit_read_to_parent = read_to_parent / total_fanout
114+
max_per_unit_read_to_peer = read_to_peer / total_fanout
106115

107116
result[key] = (total_fill,
108117
total_read_to_parent,

pytimeloop/looptree/energy.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pytimeloop.looptree.mapping_utilities import *
88

99

10-
def gather_actions(looptree_results, mapping, workload, bindings, is_path=False):
10+
def gather_actions(looptree_results, mapping, workload, bindings, is_path=False, use_name=False):
1111
einsum_name_to_id = workload.einsum_name_to_id()
1212

1313
einsums_with_complete_mapping = \
@@ -23,7 +23,11 @@ def gather_actions(looptree_results, mapping, workload, bindings, is_path=False)
2323
is_path)
2424
actions = {}
2525
for (buf, tensor, einsum), accesses in accesses_stats.items():
26-
buf = bindings[buf]
26+
if use_name:
27+
buf = buf
28+
else:
29+
buf = bindings[buf]
30+
2731
key = (buf, 'read')
2832
if key not in actions:
2933
actions[key] = 0
@@ -65,7 +69,7 @@ def gather_ops(ops, einsums_with_complete_mapping):
6569
continue
6670
if isinstance(v, isl.PwQPolynomial):
6771
total += get_sum_of_pw_qpolynomial(v)
68-
elif isinstance(v, Number):
72+
elif isinstance(v, Real):
6973
total += v
7074
else:
7175
total += v

pytimeloop/looptree/latency/latency.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from typing import overload
2+
from sympy import Max
23

34
from pytimeloop.isl.singular import get_value_from_singular_qpolynomial
45
from pytimeloop.looptree.latency.processors import LATENCY_PROCESSORS
@@ -9,7 +10,7 @@
910
from bindings.looptree import SpatialTag
1011

1112

12-
def get_latency(looptree_results: IslReuseAnalysisOutput,
13+
def get_latency(looptree_results,
1314
mapping,
1415
workload,
1516
arch,
@@ -22,7 +23,7 @@ def get_latency(looptree_results: IslReuseAnalysisOutput,
2223
mapping,
2324
workload,
2425
bindings)
25-
overall_latency = max(comp_latency, max(mem_latency.values()))
26+
overall_latency = Max(comp_latency, Max(*mem_latency.values()))
2627
return overall_latency, comp_latency, mem_latency
2728

2829

@@ -57,7 +58,7 @@ def compute_isl_latency(temporal_steps, mapping, workload):
5758

5859
def compute_summarized_latency(temporal_steps, mapping, workload):
    """Return the sum of every value stored in ``temporal_steps``.

    Args:
        temporal_steps: Mapping whose values are added together; the keys
            are ignored.
        mapping: Not used by this function; the parameter list mirrors
            ``compute_isl_latency``.
        workload: Not used by this function.

    Returns:
        The sum of the values, or ``0`` when ``temporal_steps`` is empty.
    """
    # TODO: this is only for single-Einsum!!!
    # Iterating the mapping itself yields keys only, so the values have to
    # be requested explicitly.
    return sum(temporal_steps.values())
6162

6263

6364
def _compute_latency(mapping, top_idx: int, temporal_steps, workload):

pytimeloop/looptree/latency/memory.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from collections import defaultdict
2+
from sympy import Max, Min
23

34
from pytimeloop.looptree.accesses import buffer_accesses_from_buffet_actions
45
from pytimeloop.looptree.reuse.isl import IslReuseAnalysisOutput
@@ -21,6 +22,8 @@ def memory_latency(
2122

2223
component_to_read_writes = defaultdict(lambda: [None, None])
2324
for level, component in bindings.items():
25+
if isinstance(looptree_results, SummarizedAnalysisOutput):
26+
level = component
2427
read_count = 0
2528
write_count = 0
2629
for key, accesses in accesses_stats.items_with_buffer(level):
@@ -47,21 +50,23 @@ def memory_latency(
4750
# All shared bw for writing
4851
write_latency = writes / (write_bw + shared_bw)
4952
read_latency = reads / read_bw
50-
if write_latency >= read_latency:
51-
component_latency[component] = write_latency
52-
continue
53+
all_shared_for_write_latency = Max(write_latency, read_latency)
54+
5355
# All shared bw for reading
5456
write_latency = writes / write_bw
5557
read_latency = reads / (read_bw + shared_bw)
56-
if read_latency >= write_latency:
57-
component_latency[component] = read_latency
58-
continue
58+
all_shared_for_read_latency = Max(write_latency, read_latency)
59+
5960
# Shared bw shared for reading and writing
60-
component_latency[component] = (
61+
shared_for_read_and_write_latency = (
6162
(reads + writes)
6263
/
6364
(read_bw + write_bw + shared_bw)
6465
)
66+
67+
component_latency[component] = Min(all_shared_for_write_latency,
68+
all_shared_for_read_latency,
69+
shared_for_read_and_write_latency)
6570
return component_latency
6671

6772

pytimeloop/looptree/reuse/summarized/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
from .symbolic import SummarizedAnalysisOutput
1+
from .symbolic import SummarizedAnalysisOutput, analyze_reuse
22
from .compiler import compile_analysis_result

pytimeloop/looptree/reuse/summarized/symbolic.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,11 @@ class SummarizedAnalysisOutput:
2626
op_intensity: dict = field(default_factory=dict)
2727

2828

29-
def analyze_reuse(mapping,
30-
workload: LooptreeWorkload,
31-
analyzer: LooptreeWorkloadDependencyAnalyzer):
29+
def analyze_reuse(
30+
mapping,
31+
workload: LooptreeWorkload,
32+
analyzer: LooptreeWorkloadDependencyAnalyzer
33+
) -> SummarizedAnalysisOutput:
3234
einsum_name_to_id = workload.einsum_name_to_id()
3335
rank_name_to_id = workload.dimension_name_to_id()
3436
tensor_name_to_id = workload.data_space_name_to_id()
@@ -73,7 +75,7 @@ def analyze_reuse(mapping,
7375

7476
tile_shapes = []
7577

76-
output = IslReuseAnalysisOutput()
78+
output = SummarizedAnalysisOutput()
7779

7880
latency = 1
7981
potential_tensor_access_multiplier = defaultdict(lambda: 1)

pytimeloop/looptree/run.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
import islpy as isl
55

66
from bindings.config import Config
7-
from bindings.looptree import LooptreeModelApp, LooptreeWorkload
7+
from bindings.looptree import LooptreeModelApp, LooptreeWorkload, LooptreeWorkloadDependencyAnalyzer
88

99
from pytimeloop.file import gather_yaml_configs
1010

1111
from pytimeloop.looptree.capacity import compute_capacity_usage
1212
from pytimeloop.looptree.reuse.isl.des import deserialize_looptree_output
13+
from pytimeloop.looptree.reuse.summarized import analyze_reuse
1314
from pytimeloop.looptree.energy import gather_actions, compute_energy_from_actions
1415
from pytimeloop.looptree.latency import get_latency
1516

@@ -69,3 +70,49 @@ def run_looptree(config_dir, paths, tmp_path, bindings, call_accelergy):
6970
actions,
7071
mem_latency,
7172
capacity_usage=component_capacity_usage)
73+
74+
75+
def run_looptree_symbolic(config_dir, paths, tmp_path, bindings, call_accelergy):
    """Evaluate a mapping with the symbolic (summarized) reuse analysis.

    Args:
        config_dir: Directory that is joined with each entry of ``paths``
            using the ``/`` operator.
        paths: Names of the YAML configuration files inside ``config_dir``.
        tmp_path: Directory handed to Accelergy; ``ERT.yaml`` is read back
            from it when ``call_accelergy`` is true.
        bindings: Mapping from mapping level to architecture component name.
        call_accelergy: When true, run Accelergy first and reload the
            specification together with the generated ``ERT.yaml``.

    Returns:
        A ``LoopTreeStatistics`` built from the overall latency, the energy,
        the gathered actions, the memory latency and the per-component
        capacity usage.
    """
    yaml_str = gather_yaml_configs(config_dir, paths)

    config = Config(yaml_str, 'yaml')
    workload = LooptreeWorkload.parse_cfg(config.root['problem'])
    analyzer = LooptreeWorkloadDependencyAnalyzer(workload)

    # The same list of files is needed for both the initial load and the
    # reload after Accelergy, so build it once.
    yaml_paths = [str(config_dir / p) for p in paths]
    spec = Specification.from_yaml_files(yaml_paths)

    if call_accelergy:
        if isinstance(tmp_path, Path):
            tmp_path = str(tmp_path)
        call_accelergy_verbose(spec, tmp_path)
        # Reload so that the specification also carries the ERT that
        # Accelergy just wrote.
        spec = Specification.from_yaml_files(
            yaml_paths + [str(Path(tmp_path) / 'ERT.yaml')]
        )

    # Only the analysis result is consumed below; the tile shapes are unused.
    _tile_shapes, result = analyze_reuse(spec.mapping.nodes, workload, analyzer)

    # use_name=True keeps the buffer keys as they are instead of translating
    # them through ``bindings``.
    actions = gather_actions(result, spec.mapping, workload, bindings, use_name=True)
    energy = compute_energy_from_actions(actions, spec.ERT)

    latency, _comp_latency, mem_latency = get_latency(result,
                                                      spec.mapping,
                                                      workload,
                                                      spec.architecture,
                                                      bindings)

    capacity_usage = compute_capacity_usage(spec.mapping.nodes,
                                            result.occupancy,
                                            workload)
    # Re-key the capacity usage by component name, skipping levels that
    # have no recorded usage.
    component_capacity_usage = {
        component: capacity_usage[level]
        for level, component in bindings.items()
        if level in capacity_usage
    }

    return LoopTreeStatistics(latency,
                              energy,
                              actions,
                              mem_latency,
                              capacity_usage=component_capacity_usage)

tests/looptree/reuse_analysis/test_symbolic.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ def test_model_with_two_level_mm(self):
2929
P1_tile_shape, C1_tile_shape, M1_tile_shape = tile_shapes
3030

3131
REFERENCE_FILLS = {
32-
('DRAM', 0, 0): (None, 18),
33-
('DRAM', 1, 0): (None, 8),
34-
('DRAM', 2, 0): (None, 36),
32+
('MainMemory', 0, 0): (None, 18),
33+
('MainMemory', 1, 0): (None, 8),
34+
('MainMemory', 2, 0): (None, 36),
3535
('GlobalBuffer', 0, 0): (None, 18.0*ceiling(4/M1_tile_shape)),
3636
('GlobalBuffer', 1, 0): (None, 8)
3737
}
@@ -61,9 +61,9 @@ def test_model_with_two_level_mm(self):
6161
for M1_tile_shape_val in [1, 2, 4]:
6262

6363
REFERENCE_FILLS = {
64-
('DRAM', 0, 0): 18,
65-
('DRAM', 1, 0): 8,
66-
('DRAM', 2, 0): 36,
64+
('MainMemory', 0, 0): 18,
65+
('MainMemory', 1, 0): 8,
66+
('MainMemory', 2, 0): 36,
6767
('GlobalBuffer', 0, 0): 18.0*ceil(4/M1_tile_shape_val),
6868
('GlobalBuffer', 1, 0): 8
6969
}

tests/looptree/test_run.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
from pathlib import Path
2+
from pprint import pp
23
import unittest
34

4-
from pytimeloop.looptree.run import run_looptree
5+
from bindings.looptree import LooptreeWorkload, LooptreeWorkloadDependencyAnalyzer
56

7+
from pytimeloop.looptree.run import run_looptree, run_looptree_symbolic
8+
9+
from tests.load_config_mixin import LoadConfigMixin
610
from tests.util import TEST_TMP_DIR
711

812

@@ -40,3 +44,34 @@ def test_fused_sequential(self):
4044

4145
for k, v in stats.energy.items():
4246
self.assertAlmostEqual(ENERGY_REFS[k], v, 1)
47+
48+
49+
class TestLooptreeSymbolic(unittest.TestCase, LoadConfigMixin):
    """Checks the action counts reported by ``run_looptree_symbolic``."""

    def test_two_level_mm(self):
        """Run the symbolic model on the test configs and compare actions."""
        # Mapping level -> architecture component.
        level_to_component = {
            0: 'MainMemory',
            1: 'GlobalBuffer',
            2: 'GlobalBuffer',
            3: 'GlobalBuffer',
            4: 'MACC'
        }
        config_dir = Path(__file__).parent.parent / 'test_configs'
        config_files = [
            'symbolic-mapping.yaml',
            'cascaded_mm.workload.yaml',
            'three_level.arch.yaml'
        ]

        # The final positional argument is ``call_accelergy``.
        stats = run_looptree_symbolic(config_dir,
                                      config_files,
                                      TEST_TMP_DIR,
                                      level_to_component,
                                      True)

        expected_actions = {
            ('MainMemory', 'read'): 26,
            ('MainMemory', 'write'): 36,
            ('MACC', 'compute'): 72
        }
        for action_key in expected_actions:
            self.assertEqual(stats.actions[action_key],
                             expected_actions[action_key])

tests/test_configs/symbolic-mapping.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ mapping:
22
type: fused
33
nodes:
44
- type: storage
5-
target: DRAM
5+
target: MainMemory
66
dspace: [Filter1, Fmap1, Fmap2]
77
- type: storage
88
target: GlobalBuffer

0 commit comments

Comments
 (0)