Skip to content

Commit 56224de

Browse files
author
Michael Gilbert
committed
[looptree] New latency model and updated returned statistics
1 parent 9796f2f commit 56224de

File tree

10 files changed

+215
-37
lines changed

10 files changed

+215
-37
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cmake_minimum_required(VERSION 3.4)
1+
cmake_minimum_required(VERSION 3.5)
22
project(timeloop_python)
33

44
if(DEFINED ENV{TIMELOOP_INCLUDE_PATH})

pytimeloop/isl/reduction.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,16 @@ def make_reduction_map(space, dims_out_first, n_dims_out):
66
"""
77
return isl.Map.identity(space.map_from_set())\
88
.project_out(isl.dim_type.in_, dims_out_first, n_dims_out)
9+
10+
11+
def make_reduction_map_from_mask(space, mask):
    """
    Creates a reduction map that removes dimensions marked True in `mask`,
    e.g., if mask is [False, True, False] and space is { [x, y, z] }, the
    result is { [x, z] -> [x, y, z] }

    Args:
        space: the space the identity map is built from (via
            `space.map_from_set()`).
        mask: iterable of flags; position `i` being truthy means input
            dimension `i` is projected out of the map.
            NOTE(review): presumably one flag per dimension of `space`;
            confirm against callers.

    Returns:
        The identity map on `space` with the masked input dimensions
        projected out.
    """
    isl_map = isl.Map.identity(space.map_from_set())
    # Walk the mask from the highest index down: projecting out an input
    # dimension shifts every later dimension one position to the left, so
    # removing in descending order keeps the remaining indices valid.
    for i, is_reduced in reversed(list(enumerate(mask))):
        if is_reduced:
            isl_map = isl_map.project_out(isl.dim_type.in_, i, 1)
    return isl_map

pytimeloop/isl/sum.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
from collections.abc import Sequence
12
import islpy as isl
23

3-
from .reduction import make_reduction_map
4+
from .reduction import make_reduction_map, make_reduction_map_from_mask
45

56

67
def sum_until_idx(n_dims_left: int, pw_qp):
@@ -11,3 +12,9 @@ def sum_until_idx(n_dims_left: int, pw_qp):
1112
dims_out_first,
1213
n_dims_out)
1314
return reduction_map.apply_pw_qpolynomial(pw_qp)
15+
16+
17+
def sum_with_mask(mask: Sequence[bool], pw_qp):
    """
    Reduces `pw_qp` over the dimensions marked True in `mask`.

    A reduction map is built from the domain space of `pw_qp` and `mask`
    with `make_reduction_map_from_mask`, and then applied to `pw_qp`.

    Args:
        mask: flags selecting which domain dimensions are reduced.
        pw_qp: object providing `get_domain_space()`; it is passed to the
            reduction map's `apply_pw_qpolynomial`.

    Returns:
        The result of `reduction_map.apply_pw_qpolynomial(pw_qp)`.
    """
    reduction_map = make_reduction_map_from_mask(pw_qp.get_domain_space(),
                                                 mask)
    return reduction_map.apply_pw_qpolynomial(pw_qp)

pytimeloop/looptree/accesses.py

Lines changed: 60 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
from collections.abc import Mapping
33
from numbers import Number
44

5+
from bindings.looptree import TemporalTag, SequentialTag, PipelineTemporalTag
6+
57
import islpy as isl
68

79
from pytimeloop.isl.singular import get_sum_of_pw_qpolynomial
10+
from pytimeloop.isl.sum import sum_with_mask
811
from pytimeloop.looptree.mapping_utilities import *
912

1013

@@ -29,7 +32,8 @@ def reads_and_writes_from_fill_by_parent(fills: Mapping,
2932
reads_to_parent,
3033
mapping,
3134
workload,
32-
is_path=False):
35+
is_path=False,
36+
per_unit=False):
3337
mapping = mapping['nodes']
3438
dspace_id_to_name = workload.data_space_id_to_name()
3539
einsum_id_to_name = workload.einsum_id_to_name()
@@ -49,8 +53,33 @@ def reads_and_writes_from_fill_by_parent(fills: Mapping,
4953
for (buffer_id, dspace_id, einsum_id), (tags, fill) in fills.items():
5054
read_to_parent = reads_to_parent[(buffer_id, dspace_id, einsum_id)][1]
5155

52-
read_to_parent = get_sum_of_pw_qpolynomial(read_to_parent)
53-
fill = get_sum_of_pw_qpolynomial(fill)
56+
if not per_unit:
57+
read_to_parent = get_sum_of_pw_qpolynomial(read_to_parent)
58+
fill = get_sum_of_pw_qpolynomial(fill)
59+
else:
60+
fill = sum_with_mask(
61+
[
62+
(
63+
isinstance(t, TemporalTag) or
64+
isinstance(t, PipelineTemporalTag) or
65+
isinstance(t, SequentialTag)
66+
)
67+
for t in tags
68+
],
69+
fill
70+
).max().to_python()
71+
n_read_to_parent_dim = read_to_parent.dim(isl.dim_type.in_)
72+
read_to_parent = sum_with_mask(
73+
[
74+
(
75+
isinstance(t, TemporalTag) or
76+
isinstance(t, PipelineTemporalTag) or
77+
isinstance(t, SequentialTag)
78+
)
79+
for t in tags[:n_read_to_parent_dim]
80+
],
81+
read_to_parent
82+
).max().to_python()
5483

5584
dspace_name = dspace_id_to_name[dspace_id]
5685
einsum_name = einsum_id_to_name[einsum_id]
@@ -61,24 +90,32 @@ def reads_and_writes_from_fill_by_parent(fills: Mapping,
6190
key = (parent_buffer, dspace_name, einsum_name)
6291
if dspace_id in workload.tensors_written_by_einsum(einsum_id):
6392
writes[key] += read_to_parent
93+
reads[key] += read_to_parent
6494
# Subtracted term: elided first read of a read-write tensor
65-
reads[key] += \
66-
read_to_parent - workload.get_tensor_volume(dspace_id)
95+
# TODO: figure out how to do this per unit
96+
if not per_unit:
97+
reads[key] -= workload.get_tensor_volume(dspace_id)
6798
elif dspace_id in workload.tensors_read_by_einsum(einsum_id):
6899
reads[key] += read_to_parent
69100
# Fills will write into current buffer except for compute (which does
70101
# not have write action) and top-level buffer
71102
if buffer_id not in compute_targets and parent_buffer is not None:
72103
if dspace_id in workload.tensors_written_by_einsum(einsum_id):
73-
writes[(buffer_id, dspace_name, einsum_name)] += \
74-
fill - workload.get_tensor_volume(dspace_id)
104+
writes[(buffer_id, dspace_name, einsum_name)] += fill
105+
if not per_unit:
106+
writes[(buffer_id, dspace_name, einsum_name)] -= \
107+
workload.get_tensor_volume(dspace_id)
75108
else:
76109
writes[(buffer_id, dspace_name, einsum_name)] += fill
77110

78111
return reads, writes
79112

80113

81-
def reads_and_writes_from_fill_by_peer(fills: Mapping, mapping, workload, is_path=False):
114+
def reads_and_writes_from_fill_by_peer(fills: Mapping,
115+
mapping,
116+
workload,
117+
is_path=False,
118+
per_unit=False):
82119
mapping = mapping['nodes']
83120
dspace_id_to_name = workload.data_space_id_to_name()
84121
einsum_id_to_name = workload.einsum_id_to_name()
@@ -89,14 +126,27 @@ def reads_and_writes_from_fill_by_peer(fills: Mapping, mapping, workload, is_pat
89126
einsums_with_complete_mappings = get_einsums_with_complete_mappings(mapping, workload, is_path)
90127

91128
for (buffer_id, dspace_id, einsum_id), (tags, fill) in fills.items():
92-
fill = get_sum_of_pw_qpolynomial(fill)
129+
if not per_unit:
130+
fill = get_sum_of_pw_qpolynomial(fill)
131+
else:
132+
fill = sum_with_mask(
133+
[
134+
(
135+
isinstance(t, TemporalTag) or
136+
isinstance(t, PipelineTemporalTag) or
137+
isinstance(t, SequentialTag)
138+
)
139+
for t in tags
140+
],
141+
fill
142+
).max().to_python()
93143
einsum_name = einsum_id_to_name[einsum_id]
94144
dspace_name = dspace_id_to_name[dspace_id]
95145
if einsum_id not in einsums_with_complete_mappings:
96146
continue
97147

98148
reads[(buffer_id, dspace_name, einsum_name)] = fill
99-
writes[(buffer_id, dspace_name, einsum_name)] = 0 # already accounted for in above
149+
writes[(buffer_id, dspace_name, einsum_name)] = 0 # already accounted for in fill_by_parent
100150

101151
return reads, writes
102152

pytimeloop/looptree/energy.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ def gather_actions(looptree_results, mapping, workload, bindings, is_path=False)
1212

1313
einsum_name_to_id = workload.einsum_name_to_id()
1414

15-
einsums_with_complete_mapping = get_einsums_with_complete_mappings(mapping['nodes'], workload, is_path)
15+
einsums_with_complete_mapping = \
16+
get_einsums_with_complete_mappings(mapping['nodes'], workload, is_path)
1617
einsums_with_complete_mapping = {
1718
e if isinstance(e, int) else einsum_name_to_id[e]
1819
for e in einsums_with_complete_mapping
@@ -94,7 +95,7 @@ def gather_ops(ops, einsums_with_complete_mapping):
9495
if einsum_id not in einsums_with_complete_mapping:
9596
continue
9697
if isinstance(v, isl.PwQPolynomial):
97-
total += get_sum_of_pw_qpolynomial(v).to_python()
98+
total += get_sum_of_pw_qpolynomial(v)
9899
elif isinstance(v, Number):
99100
total += v
100101
else:
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from .latency import compute_latency
1+
from .latency import get_latency

pytimeloop/looptree/latency/latency.py

Lines changed: 62 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,84 @@
11
from collections import defaultdict
22

33
from pytimeloop.isl.singular import get_value_from_singular_qpolynomial
4+
from pytimeloop.looptree.accesses import (
5+
reads_and_writes_from_fill_by_parent,
6+
reads_and_writes_from_fill_by_peer
7+
)
48
from pytimeloop.looptree.latency.processors import LATENCY_PROCESSORS
9+
from pytimeloop.looptree.des import LooptreeOutput
510

611
from bindings.looptree import SpatialTag
712

813

9-
def get_latency(actions, mapping, temporal_steps, workload, arch):
10-
comp_latency = compute_latency(mapping, temporal_steps, workload)
11-
mem_latency = memory_latency(actions, arch)
12-
return max(comp_latency, max(mem_latency.values()))
14+
def get_latency(looptree_results: LooptreeOutput,
                mapping,
                workload,
                arch,
                bindings):
    """
    Computes the overall latency together with its compute and memory parts.

    Args:
        looptree_results: LoopTree output; `temporal_steps` is read here and
            the whole object is forwarded to `memory_latency`.
        mapping: mapping forwarded to `compute_latency` and `memory_latency`.
        workload: workload forwarded to both latency models.
        arch: architecture forwarded to `memory_latency`.
        bindings: bindings forwarded to `memory_latency`.

    Returns:
        A tuple `(overall_latency, comp_latency, mem_latency)` where
        `overall_latency` is the maximum of the compute latency and the
        largest value in `mem_latency`.
    """
    comp_latency = compute_latency(mapping,
                                   looptree_results.temporal_steps,
                                   workload)
    mem_latency = memory_latency(looptree_results,
                                 arch,
                                 mapping,
                                 workload,
                                 bindings)
    # `max()` on an empty iterable raises ValueError; when no memory latency
    # is reported, the compute latency alone determines the result.
    slowest_memory = max(mem_latency.values(), default=comp_latency)
    overall_latency = max(comp_latency, slowest_memory)
    return overall_latency, comp_latency, mem_latency
1329

1430

1531
def compute_latency(mapping, temporal_steps, workload):
1632
return get_value_from_singular_qpolynomial(
17-
_compute_latency(mapping, 0, temporal_steps, workload)[1]
33+
_compute_latency(mapping.nodes, 0, temporal_steps, workload)[1]
1834
).to_python()
1935

2036

21-
def memory_latency(actions, arch):
37+
def memory_latency(looptree_results: LooptreeOutput,
38+
arch,
39+
mapping,
40+
workload,
41+
bindings):
42+
reads, writes = reads_and_writes_from_fill_by_parent(
43+
looptree_results.fills,
44+
looptree_results.reads_to_parent,
45+
mapping,
46+
workload,
47+
per_unit=True
48+
)
49+
50+
peer_reads, peer_writes = reads_and_writes_from_fill_by_peer(
51+
looptree_results.reads_to_peer,
52+
mapping,
53+
workload,
54+
per_unit=True
55+
)
56+
2257
component_to_read_writes = defaultdict(lambda: [None, None])
23-
for (component, action), count in actions.items():
24-
if action == 'read':
25-
component_to_read_writes[component][0] = count
26-
elif action == 'write':
27-
component_to_read_writes[component][1] = count
58+
for level, component in bindings.items():
59+
read_count = sum(reads[key] for key in reads if key[0] == level)
60+
read_count += sum(peer_reads[key]
61+
for key in peer_reads if key[0] == level)
62+
write_count = sum(writes[key] for key in writes if key[0] == level)
63+
write_count += sum(peer_writes[key]
64+
for key in peer_writes if key[0] == level)
65+
if component not in component_to_read_writes:
66+
component_to_read_writes[component][0] = read_count
67+
component_to_read_writes[component][1] = write_count
68+
else:
69+
component_to_read_writes[component][0] += read_count
70+
component_to_read_writes[component][1] += write_count
2871

2972
component_latency = {}
3073
bandwidths = get_bandwidth(arch)
3174
for component, (reads, writes) in component_to_read_writes.items():
3275
read_bw, write_bw, shared_bw = bandwidths[component]
76+
77+
# For numerical stability
78+
read_bw += 1e-8
79+
write_bw += 1e-8
80+
shared_bw += 1e-8
81+
3382
# All shared bw for writing
3483
write_latency = writes / (write_bw + shared_bw)
3584
read_latency = reads / read_bw
@@ -58,6 +107,8 @@ def get_bandwidth(arch):
58107
n_rd_ports = attributes.get('n_rd_ports', 0)
59108
n_wr_ports = attributes.get('n_wr_ports', 0)
60109
n_rdwr_ports = attributes.get('n_rdwr_ports', 0)
110+
if n_rd_ports + n_wr_ports + n_rdwr_ports < 1:
111+
n_rdwr_ports = 1
61112

62113
width = attributes['width']
63114
datawidth = attributes['datawidth']

pytimeloop/looptree/run.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from dataclasses import dataclass
12
from pathlib import Path
23

34
import islpy as isl
@@ -7,14 +8,24 @@
78

89
from pytimeloop.file import gather_yaml_configs
910

11+
from pytimeloop.looptree.capacity import compute_capacity_usage
1012
from pytimeloop.looptree.des import deserialize_looptree_output
1113
from pytimeloop.looptree.energy import gather_actions, compute_energy_from_actions
12-
from pytimeloop.looptree.latency import compute_latency
14+
from pytimeloop.looptree.latency import get_latency
1315

1416
from pytimeloop.timeloopfe.v4fused import Specification
1517
from pytimeloop.timeloopfe.common.backend_calls import call_accelergy_verbose
1618

1719

20+
@dataclass
class LoopTreeStatistics:
    """Statistics bundle returned by `run_looptree`."""

    # Overall latency (first element of the tuple returned by `get_latency`).
    latency: float
    # Energy computed by `compute_energy_from_actions`.
    energy: float
    # Actions gathered by `gather_actions`.
    actions: dict
    # Memory latency dict (third element returned by `get_latency`).
    memory_latency: dict
    # Capacity usage keyed by the component name each level is bound to.
    capacity_usage: dict
27+
28+
1829
def run_looptree(config_dir, paths, tmp_path, bindings, call_accelergy):
1930
yaml_str = gather_yaml_configs(config_dir, paths)
2031
config = Config(yaml_str, 'yaml')
@@ -39,8 +50,22 @@ def run_looptree(config_dir, paths, tmp_path, bindings, call_accelergy):
3950
actions = gather_actions(result, spec.mapping, workload, bindings)
4051
energy = compute_energy_from_actions(actions, spec.ERT)
4152

42-
latency = compute_latency(spec.mapping.nodes,
43-
result.temporal_steps,
44-
workload)
53+
latency, comp_latency, mem_latency = get_latency(result,
54+
spec.mapping,
55+
workload,
56+
spec.architecture,
57+
bindings)
58+
59+
capacity_usage = compute_capacity_usage(spec.mapping.nodes,
60+
result.occupancy,
61+
workload)
62+
component_capacity_usage = {}
63+
for level, component in bindings.items():
64+
if level in capacity_usage:
65+
component_capacity_usage[component] = capacity_usage[level]
4566

46-
return latency, energy
67+
return LoopTreeStatistics(latency,
68+
energy,
69+
actions,
70+
mem_latency,
71+
capacity_usage=component_capacity_usage)

tests/looptree/test_latency.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from pathlib import Path
2+
import unittest
3+
4+
from pytimeloop.looptree.run import run_looptree
5+
6+
from tests.util import TEST_TMP_DIR
7+
8+
9+
class TestLatency(unittest.TestCase):
    """Latency regression test for `run_looptree`."""

    def test_fused_sequential(self):
        """Runs the fused test configuration and checks the overall latency."""
        # Level id -> architecture component name.
        bindings = {
            0: 'MainMemory',
            1: 'GlobalBuffer',
            2: 'GlobalBuffer',
            3: 'GlobalBuffer',
            4: 'MACC'
        }

        stats = run_looptree(
            Path(__file__).parent.parent / 'test_configs',
            [
                'looptree-test-fused.yaml',
                'cascaded_mm.workload.yaml',
                'three_level.arch.yaml'
            ],
            TEST_TMP_DIR,
            bindings,
            call_accelergy=True
        )

        self.assertEqual(54, stats.latency)

0 commit comments

Comments
 (0)