
Commit 7a23dae

Add a hier_mult command line argument for setting the hierarchy multiplier used in the default pipelining iterations. Find an optimal hierarchy multiplier value for RT on ECP5. Detect identical, unchanged timing results as a bad sign and take a different action instead of repeating. Tune other pipelining magic numbers: disable submodule attempts at higher-fmax compensation and halve the coarse sweep latency increase jump size. Always use the nextpnr default router.
1 parent fe22dc6 commit 7a23dae

3 files changed: +140 -50 lines

src/OPEN_TOOLS.py

Lines changed: 6 additions & 5 deletions
@@ -370,11 +370,12 @@ def SYN_AND_REPORT_TIMING_NEW(
     m_ghdl = ""
     if not GHDL_PLUGIN_BUILT_IN:
         m_ghdl = "-m ghdl "
-    optional_router2 = "--router router2"
-    if inst_name:
-        # Dont use router two for small single instances
-        # Only use router two for multi main top level no inst_name
-        optional_router2 = ""
+    optional_router2 = "" # Always default router for now...
+    #optional_router2 = "--router router2"
+    #if inst_name:
+    #    # Dont use router two for small single instances
+    #    # Only use router two for multi main top level no inst_name
+    #    optional_router2 = ""
     sh_file = top_entity_name + ".sh"
     sh_path = output_directory + "/" + sh_file
     f = open(sh_path, "w")
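The change above hard-codes optional_router2 to an empty string, so the generated .sh script always runs nextpnr with its default router rather than opting into router2. A rough sketch of how such a flag typically folds into the nextpnr command line (the helper and file names below are illustrative, not the exact strings SYN_AND_REPORT_TIMING_NEW writes out):

    # Illustrative only: an empty optional_router2 simply drops the flag,
    # letting nextpnr fall back to its default router.
    def nextpnr_cmd(top_entity_name, target_mhz, use_router2=False):
        optional_router2 = "--router router2" if use_router2 else ""
        parts = [
            "nextpnr-ecp5",
            "--json", top_entity_name + ".json",
            "--freq", str(target_mhz),
            optional_router2,
        ]
        return " ".join(p for p in parts if p)  # skip the empty flag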

src/SYN.py

Lines changed: 102 additions & 18 deletions
@@ -37,7 +37,10 @@
 # Welcome to the land of magic numbers
 # "But I think its much worse than you feared" Modest Mouse - I'm Still Here
 
-#WTF?
+# Artix 7 100t full RT:
+# Coarse sweep BEST IS sliced 494 clocks, get 490 clock pipeline at 153MHz
+# Coarse (not sweep) gets to 504 slice, 500 clk, 10 clks off fine for now...
+# With BEST_GUESS_MUL_MAX = 10.0, COARSE_SWEEP_MULT_MAX = 1.5, not skipping higher fmax to compensate
 # RT pixel_logic
 # HIER_SWEEP_MULT_MIN latency # top runs
 # 0.125 470 32
@@ -57,23 +60,53 @@
 # 2.0 585 9
 # 2.25 493 8
 # 2.5 509 7
+#=== After Recent updates
+# 2.0? 536 5
+# 1.9375 343 8
+# 1.5 368 8
+# 1.0 417 15
+# 0.5 ~417 ~45 reaches 1.0x start after ~33runs
+# 0.0 ~417 ~65 reaches 1.0x after ~55 runs
+
+# ECP5 Full RT:
+# Coarse sweep gets to: 75 clocks
+# Coarse (not sweep) gets to 81 clocks
+# HIER_SWEEP_MULT_MIN latency # top runs
+# 2.0 80+clks never finished ~24hrs hash=5c32
+# 1.9375 ^same
+# 1.5 83+clks never finished ~12 hours hash=a39f
+# 1.0 63+clks never finished ~12 hrs hash=e748
+# 0.5 45+clks never finished ~12 hr hash=22ef
+# 0.25 60+clks never finished 12+hours hash=5237
+# 0.181 79+clks never finished 12+hours hash=222f
+# 0.125 ^same
+# 0.09375 ^same
+# 0.078125 \/same
+# 0.06780026720106881
+# 0.0625 66 4 ^ ends up at slighty higher after failing coarse
+# 0.046875 ^same
+# 0.03125 \/same
+# 0.0 70 2 # wow
+
+# Tool increases HIER_SWEEP_MULT_MIN if it has trouble
+# i.e. tries to pipeline smaller/less logic modules, easier task
+# Starting off too large is bad - no way to recover
 
-MAX_N_WORSE_RESULTS_MULT = 16 # Multiplier for how many times failing to improve before moving on? divided by total latnecy
-BEST_GUESS_MUL_MAX = 10.0 # Multiplier limit on top down register insertion coarsly during middle out sweep
-MAX_ALLOWED_LATENCY_MULT = (
-    10 # Multiplier limit for individual module coarse register insertion coarsely, similar same as BEST_GUESS_MUL_MAX?
-)
 # Target pipelining at minimum at modules of period = target/MULT
+# 0.0 -> start w/ largest/top level modules first
 # 0.5 -> 2 times the target period (not meeting timing),
 # 2.0 -> 1/2 the target period (easily meets timing)
-HIER_SWEEP_MULT_MIN = (
-    1.9375
-)
+HIER_SWEEP_MULT_MIN = None # Cmd line arg sets
 HIER_SWEEP_MULT_INC = (
-    0.001 # Intentionally very small, sweep tries to make largest possible steps
+    0.001 # Intentionally very small, sweep already tries to make largest possible steps
+)
+MAX_N_WORSE_RESULTS_MULT = 16 # Multiplier for how many times failing to improve before moving on? divided by total latnecy
+BEST_GUESS_MUL_MAX = 5.0 # Multiplier limit on top down register insertion coarsly during middle out sweep
+MAX_ALLOWED_LATENCY_MULT = (
+    5.0 # Multiplier limit for individual module coarse register insertion coarsely, similar same as BEST_GUESS_MUL_MAX?
 )
 COARSE_SWEEP_MULT_INC = 0.1 # Multiplier increment for how many times fmax to try for compensating not meeting timing
-COARSE_SWEEP_MULT_MAX = 1.5 # Max multiplier for internal fmax
+COARSE_SWEEP_MULT_MAX = 1.0 # ==1.0 disables trying to pipeline to fake higher fmax, Max multiplier for internal fmax
 MAX_CLK_INC_RATIO = 1.25 # Multiplier for how any extra clocks can be added ex. 1.25 means 25% more stages max
 DELAY_UNIT_MULT = 10.0 # Timing is reported in nanoseconds. Multiplier to convert that time into integer units (nanosecs, tenths, hundreds of nanosecs)
 INF_MHZ = 1000 # Impossible timing goal
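The HIER_SWEEP_MULT_MIN comments above boil down to a threshold on the ratio of the target clock period to a module's estimated path delay. A rough sketch of that selection rule, assuming hier_sweep_mult_func is that ratio (the real try_to_slice logic in SYN.py is more involved than this):

    # Hypothetical helper names; only the threshold rule is taken from the comments above.
    def hier_sweep_mult(target_path_delay_ns, func_path_delay_ns):
        # Ratio of the target clock period to a module's estimated path delay
        return target_path_delay_ns / func_path_delay_ns

    def pipelining_candidates(module_delays_ns, target_delay_ns, hier_sweep_mult_min):
        # mult_min = 0.0 -> even the largest/slowest modules qualify (new default)
        # mult_min = 0.5 -> only modules within 2x the target period
        # mult_min = 2.0 -> only modules already at 1/2 the target period
        return {
            name
            for name, delay_ns in module_delays_ns.items()
            if hier_sweep_mult(target_delay_ns, delay_ns) >= hier_sweep_mult_min
        }

    # Example with a 10 ns target period (100 MHz):
    delays = {"top": 40.0, "pixel_logic": 18.0, "small_alu": 4.0}
    print(pipelining_candidates(delays, 10.0, 0.0))  # all three modules
    print(pipelining_candidates(delays, 10.0, 0.5))  # pixel_logic and small_alu
    print(pipelining_candidates(delays, 10.0, 2.0))  # small_alu only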
@@ -2107,8 +2140,9 @@ class InstSweepState:
     def __init__(self):
         self.met_timing = False
         self.timing_report = None # Current timing report with multiple paths
-        self.mhz_to_latency = dict() # dict[mhz] = latency
-        self.latency_to_mhz = dict() # dict[latency] = mhz
+        self.mhz_to_latency = dict() # BEST dict[mhz] = latency
+        self.latency_to_mhz = dict() # BEST dict[latency] = mhz
+        self.last_mhz = None
 
         # Coarse grain sweep
         self.coarse_latency = None
@@ -3377,6 +3411,8 @@ def hier_sweep_mult_func(target_path_delay_ns, func_path_delay_ns):
         better_mhz = curr_mhz > best_mhz_so_far
         better_latency = curr_mhz > best_mhz_this_latency
         # Log result
+        got_same_mhz_again = curr_mhz == sweep_state.inst_sweep_state[main_inst].last_mhz
+        sweep_state.inst_sweep_state[main_inst].last_mhz = curr_mhz
         if better_mhz or better_latency:
             sweep_state.inst_sweep_state[main_inst].mhz_to_latency[
                 curr_mhz
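Together with the last_mhz field added to InstSweepState above, the logging step amounts to roughly the following (illustrative helper only; the real code keeps this state per instance in sweep_state.inst_sweep_state[main_inst] and continues with more bookkeeping after the update):

    # Illustrative condensation of the two hunks above, not the actual sweep code.
    def log_result(state, curr_mhz, curr_latency, best_mhz_so_far, best_mhz_this_latency):
        # An fmax identical to the previous run is treated as a bad sign later on.
        got_same_mhz_again = curr_mhz == state.last_mhz
        state.last_mhz = curr_mhz
        if curr_mhz > best_mhz_so_far or curr_mhz > best_mhz_this_latency:
            state.mhz_to_latency[curr_mhz] = curr_latency  # BEST dict[mhz] = latency
            state.latency_to_mhz[curr_latency] = curr_mhz  # BEST dict[latency] = mhz
        return got_same_mhz_again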
@@ -3403,19 +3439,65 @@ def hier_sweep_mult_func(target_path_delay_ns, func_path_delay_ns):
             ].best_guess_sweep_mult
             * latency_mult
         )
-        if (
+
+        # TODO: Very much need to organize this
+
+        # Getting exactly same MHz is bad sign, that didnt pipeline current modules right
+        if got_same_mhz_again:
+            print("Got identical timing result, trying to pipeline smaller modules instead...")
+            # Dont compensate with higher fmax, start with original coarse grain compensation on smaller modules
+            # WTF float stuff end up with slice getting repeatedly set just close enough not to slice next level down ?
+            if (
+                sweep_state.inst_sweep_state[
+                    main_func
+                ].smallest_not_sliced_hier_mult
+                == sweep_state.inst_sweep_state[
+                    main_func
+                ].hier_sweep_mult
+            ):
+                sweep_state.inst_sweep_state[
+                    main_func
+                ].hier_sweep_mult += HIER_SWEEP_MULT_INC
+                print(
+                    "Nudging hierarchy sweep multiplier:",
+                    sweep_state.inst_sweep_state[
+                        main_func
+                    ].hier_sweep_mult,
+                )
+            else:
+                sweep_state.inst_sweep_state[
+                    main_inst
+                ].hier_sweep_mult = sweep_state.inst_sweep_state[
+                    main_inst
+                ].smallest_not_sliced_hier_mult
+                print(
+                    "Hierarchy sweep multiplier:",
+                    sweep_state.inst_sweep_state[
+                        main_inst
+                    ].hier_sweep_mult,
+                )
+            sweep_state.inst_sweep_state[
+                main_inst
+            ].best_guess_sweep_mult = 1.0
+            sweep_state.inst_sweep_state[
+                main_inst
+            ].coarse_sweep_mult = 1.0
+            made_adj = True
+        elif (
             new_best_guess_sweep_mult > BEST_GUESS_MUL_MAX
         ): # 15 like? main_max_allowed_latency_mult 2.0 magic?
             # Fail here, increment sweep mut and try_to_slice logic will slice lower module next time
             print(
-                "Middle sweep at this hierarchy level failed to meet timing, trying to pipeline current modules to higher fmax to compensate..."
+                "Middle sweep at this hierarchy level failed to meet timing...",
+                "Next best guess multiplier was", new_best_guess_sweep_mult
             )
             if (
                 sweep_state.inst_sweep_state[
                     main_inst
                 ].coarse_sweep_mult
                 + COARSE_SWEEP_MULT_INC
-            ) <= COARSE_SWEEP_MULT_MAX: # 1.5: # MAGIC?
+            ) <= COARSE_SWEEP_MULT_MAX:
+                print("Trying to pipeline current modules to higher fmax to compensate...")
                 sweep_state.inst_sweep_state[
                     main_inst
                 ].best_guess_sweep_mult = 1.0
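The branching above condenses to roughly the following (an illustrative simplification: it merges the main_func/main_inst lookups into a single state object, and the statements past the end of the hunk, such as the coarse_sweep_mult increment, are assumed rather than shown in the diff):

    # Sketch only; uses the module constants defined near the top of SYN.py.
    def adjust_after_result(st, got_same_mhz_again, new_best_guess_sweep_mult):
        if got_same_mhz_again:
            # Identical fmax again: stop compensating and move down to smaller modules.
            if st.smallest_not_sliced_hier_mult == st.hier_sweep_mult:
                st.hier_sweep_mult += HIER_SWEEP_MULT_INC  # nudge past float equality
            else:
                st.hier_sweep_mult = st.smallest_not_sliced_hier_mult
            st.best_guess_sweep_mult = 1.0
            st.coarse_sweep_mult = 1.0
            return True  # made an adjustment
        if new_best_guess_sweep_mult > BEST_GUESS_MUL_MAX:
            # Middle-out sweep failed at this hierarchy level; only compensate with a
            # higher internal fmax while COARSE_SWEEP_MULT_MAX > 1.0 (now disabled by default).
            if st.coarse_sweep_mult + COARSE_SWEEP_MULT_INC <= COARSE_SWEEP_MULT_MAX:
                st.best_guess_sweep_mult = 1.0
                st.coarse_sweep_mult += COARSE_SWEEP_MULT_INC  # assumed follow-on step
                return True
        return False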
@@ -3766,6 +3848,7 @@ def DO_COARSE_THROUGHPUT_SWEEP(
             print(
                 logic.func_name,
                 "reached maximum allowed latency, no more adjustments...",
+                "multiplier =",max_allowed_latency_mult
             )
             continue
 
@@ -3787,7 +3870,8 @@ def DO_COARSE_THROUGHPUT_SWEEP(
             and clk_inc >= inst_sweep_state.last_latency_increase
         ):
             # Clip to last inc size - 1, minus one to always be narrowing down
-            clk_inc = inst_sweep_state.last_latency_increase - 1
+            # Extra div by 2 helps?
+            clk_inc = int(inst_sweep_state.last_latency_increase/2) # - 1
         if clk_inc <= 0:
             clk_inc = 1
         clks = inst_sweep_state.coarse_latency + clk_inc
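With the halving rule, a failed latency jump narrows geometrically instead of one clock at a time. A toy illustration (the numbers are made up):

    # Illustrative only: successive clamped increments starting from a failed jump of 16.
    def next_clk_inc(last_latency_increase):
        clk_inc = int(last_latency_increase / 2)  # was: last_latency_increase - 1
        return max(clk_inc, 1)

    inc = 16
    while inc > 1:
        inc = next_clk_inc(inc)
        print(inc)  # 8, 4, 2, 1 rather than 15, 14, 13, ...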
@@ -4723,4 +4807,4 @@ def GET_VHDL_FILES_TCL_TEXT_AND_TOP(
         # Use next insts as current
         inst_names = set(next_inst_names)
 
-    return files_txt, top_entity_name
\ No newline at end of file
+    return files_txt, top_entity_name

src/pipelinec

Lines changed: 32 additions & 27 deletions
@@ -39,8 +39,10 @@ parser.add_argument("--modelsim", help="Setup simulation files for use with Mode
 parser.add_argument("--cxxrtl", help="Setup simulation files for use with GHDL+Yosys+CXXRTL.", action="store_true")
 parser.add_argument("--verilator", help="Setup simulation files for use with GHDL+Yosys+Verilator.", action="store_true")
 parser.add_argument("--main_cpp", type=str, default=None,help=f"Specify an existing main C++ file for simulators such as CXXRTL and Verilator.")
+parser.add_argument("--hier_mult", type=float, default=0.0,
+    help="Hierarchy sweep multiplier minimum starting value ~= 'what modules to focus pipelining efforts on to get to desired fmax'. 0.0 = starts with largest/slowest modules (default). 0.5 = modules half as fast fmax. 2.0 = modules twice as fast as fmax, etc.")
 parser.add_argument("--coarse",
-    help="Do only a single coarse grain autopipelining sweep (as opposed to default of several multi-level coarse sweeps).",
+    help="Do only a single coarse grain autopipelining sweep (as opposed to default of hierarchical several multi-level coarse sweeps).",
     action="store_true")
 parser.add_argument("--start", type=int, default=None, help="When doing a --coarse sweep, start from START cycles latency.")
 parser.add_argument("--stop", type=int, default=None, help="When doing a --coarse sweep, stop at STOP cycles latency.")
@@ -64,6 +66,8 @@ c_file = os.path.abspath(args.c_file)
 SYN.TOP_LEVEL_MODULE = args.top
 # Flag for verilog conversion
 SYN.CONVERT_FINAL_TOP_VERILOG = args.verilog
+# Hierarchy sweep const
+SYN.HIER_SWEEP_MULT_MIN = args.hier_mult
 # Output dir:
 # Use what user provided
 # Or create a new dir for each run by default
@@ -134,37 +138,40 @@ SIM.DO_OPTIONAL_SIM(args.sim, parser_state, args, multimain_timing_params)
 # No need to scare folks with my big dumb todo list printing...
 """
 TODO:
+Python cleanup PR
 Bartus help w/ Vitis HLS integration
-How to update wiki page?
-Make global wires read and written where/when used in pipeline??
-  Structs and const ref funcs break easy implementation of
-  'first func use' detection for the entire global var at oince
-  Need to propogate knowledge of read only global across const refs
-    And other no delay funcs?
-  volatile to say 'every read is different'
-    Can help reduce registers for ray tracer like Victor said
-  Async wires are another volatile++ like case where
-    pure downstream fo async funcs even with logic delay arent registered in pipeline
+--xo mode making packaing tcl?
+Fix duplicate instances of FSM funcs
+  https://github.com/JulianKemmerer/PipelineC/issues/86
+  Default to one single inst per MAIN thread?
+    Requires flattening into one module?
+    Make inst of every func used in thread?
+  Make all funcs single instance?
+  Also can fix bug in FSM style extra clock?
+    Extra clock when just func inside nested loops? ex. GoL count live neighbors
 Victor's RT
   disable -Whide for less ghdl warnings
   Yosyshq article!
-  Victor is setting up ECP5 hardware and testing basic flow...
-  nextpnr has better packing now
-    Still can only make it to ~21MHz at full RT
-  Try to rework state wire for fewer registers? (read+written where used)
-    See if reg est log shows it being extra reg'd?
   Try verilating whole pipeline?
-Frame buffer with 3 clock domains(vga, RAM, compute)
+Document INST_ARRAY array vars
+  Is read/write only style and auto connected to N insts in design
+  Can build arbiters out of that (simple data,valid, ready, or even AXI)
+Does clang does constexpr/temaples well
+  Try cindex test w/ preprocessing too
+  Isolate .pipelinec specific files to be parsed by clang in isolation
+  Simple example could be multiplier func inferring out bits from in...
+  Ideal example is FIR bit width growth calc
+    Need abitrary bit width integer type?
+    Fixed point stuff?
+General shared buffer with 3 clock domains(vga, RAM, compute) (RO, the RAM, RW)
   Valid ready signalling
-    Need N<->M sharing from below too?
-    Needs async start and complete from below?
+    Use N<->M sharing
  Use to make double frame buffer fast ~final GOL demo?
+    Needs async start and complete from below?
  How to do async start and complete (prob w/ ready+valid for cpu polling)
-  How to share N instances among M users?
-    Use frame buffer ram as demo?
-    If RAM then best to use AXI? :-/
-  Allow setting slower FSM style arb styles with more regs, not zero cycle
-Fix duplicate instances of FSM funcs (is git issue)
+  Redo FSM Style arb with N-M sharing wires
+    User will need to specify number of users of SINGLE INST (inst array size)
+  Allow setting slower FSM style arb styles with more regs, not zero cycle
 How to use delay caches?
   Path delay cache
   Redo attempt at adding up delays in netlist?
@@ -175,12 +182,10 @@ notypes+bartus talk about autopipelined feedback/stall/clock enable
 how to better handle clock enables / ready feedback? / resets even
   try to use example axis demo thing pipeline contorl signals.c
 Victor's other secret thing
-  Hope I can remember details because sure can put anything here
+  Hope I can remember details because sure cant put anything here
 Make VHDL IEEE committee suggestion for record to-from slv like verilog packing
 BUGS: Missing typedef support!?
   Shouldnt be too bad, can hack around
-Fix bug in FSM style extra clock
-  Extra clock when just func inside nested loops? ex. GoL count live neighbors
 Victor wave pipelining small example
   Measure min and max delays
   (also applies to existing pipelined circuit not just comb logic)
