37
37
# Welcome to the land of magic numbers
38
38
# "But I think its much worse than you feared" Modest Mouse - I'm Still Here
39
39
40
- #WTF?
40
+ # Artix 7 100t full RT:
41
+ # Coarse sweep BEST IS sliced 494 clocks, get 490 clock pipeline at 153MHz
42
+ # Coarse (not sweep) gets to 504 slice, 500 clk, 10 clks off fine for now...
43
+ # With BEST_GUESS_MUL_MAX = 10.0, COARSE_SWEEP_MULT_MAX = 1.5, not skipping higher fmax to compensate
41
44
# RT pixel_logic
42
45
# HIER_SWEEP_MULT_MIN latency # top runs
43
46
# 0.125 470 32
57
60
# 2.0 585 9
58
61
# 2.25 493 8
59
62
# 2.5 509 7
63
+ #=== After Recent updates
64
+ # 2.0? 536 5
65
+ # 1.9375 343 8
66
+ # 1.5 368 8
67
+ # 1.0 417 15
68
+ # 0.5 ~417 ~45 reaches 1.0x start after ~33runs
69
+ # 0.0 ~417 ~65 reaches 1.0x after ~55 runs
70
+
71
+ # ECP5 Full RT:
72
+ # Coarse sweep gets to: 75 clocks
73
+ # Coarse (not sweep) gets to 81 clocks
74
+ # HIER_SWEEP_MULT_MIN latency # top runs
75
+ # 2.0 80+clks never finished ~24hrs hash=5c32
76
+ # 1.9375 ^same
77
+ # 1.5 83+clks never finished ~12 hours hash=a39f
78
+ # 1.0 63+clks never finished ~12 hrs hash=e748
79
+ # 0.5 45+clks never finished ~12 hr hash=22ef
80
+ # 0.25 60+clks never finished 12+hours hash=5237
81
+ # 0.181 79+clks never finished 12+hours hash=222f
82
+ # 0.125 ^same
83
+ # 0.09375 ^same
84
+ # 0.078125 \/same
85
+ # 0.06780026720106881
86
+ # 0.0625 66 4 ^ ends up slightly higher after failing coarse
87
+ # 0.046875 ^same
88
+ # 0.03125 \/same
89
+ # 0.0 70 2 # wow
90
+
91
+ # Tool increases HIER_SWEEP_MULT_MIN if it has trouble
92
+ # i.e. tries to pipeline smaller/less logic modules, easier task
93
+ # Starting off too large is bad - no way to recover
60
94
61
- MAX_N_WORSE_RESULTS_MULT = 16 # Multiplier for how many times failing to improve before moving on? divided by total latnecy
62
- BEST_GUESS_MUL_MAX = 10.0 # Multiplier limit on top down register insertion coarsly during middle out sweep
63
- MAX_ALLOWED_LATENCY_MULT = (
64
- 10 # Multiplier limit for individual module coarse register insertion coarsely, similar same as BEST_GUESS_MUL_MAX?
65
- )
66
95
# Target pipelining at minimum at modules of period = target/MULT
96
+ # 0.0 -> start w/ largest/top level modules first
67
97
# 0.5 -> 2 times the target period (not meeting timing),
68
98
# 2.0 -> 1/2 the target period (easily meets timing)
69
- HIER_SWEEP_MULT_MIN = (
70
- 1.9375
71
- )
99
+ HIER_SWEEP_MULT_MIN = None # Cmd line arg sets
72
100
HIER_SWEEP_MULT_INC = (
73
- 0.001 # Intentionally very small, sweep tries to make largest possible steps
101
+ 0.001 # Intentionally very small, sweep already tries to make largest possible steps
102
+ )
103
+ MAX_N_WORSE_RESULTS_MULT = 16 # Multiplier for how many times failing to improve before moving on? divided by total latency
104
+ BEST_GUESS_MUL_MAX = 5.0 # Multiplier limit on top down register insertion coarsely during middle out sweep
105
+ MAX_ALLOWED_LATENCY_MULT = (
106
+ 5.0 # Multiplier limit for coarse register insertion in an individual module, similar to BEST_GUESS_MUL_MAX?
74
107
)
75
108
COARSE_SWEEP_MULT_INC = 0.1 # Multiplier increment for how many times fmax to try for compensating not meeting timing
76
- COARSE_SWEEP_MULT_MAX = 1.5 # Max multiplier for internal fmax
109
+ COARSE_SWEEP_MULT_MAX = 1.0 # ==1.0 disables trying to pipeline to fake higher fmax, Max multiplier for internal fmax
77
110
MAX_CLK_INC_RATIO = 1.25 # Multiplier for how many extra clocks can be added, e.g. 1.25 means 25% more stages max
78
111
DELAY_UNIT_MULT = 10.0 # Timing is reported in nanoseconds. Multiplier to convert that time into integer units (nanosecs, tenths, hundredths of nanosecs)
79
112
INF_MHZ = 1000 # Impossible timing goal
@@ -2107,8 +2140,9 @@ class InstSweepState:
2107
2140
def __init__ (self ):
2108
2141
self .met_timing = False
2109
2142
self .timing_report = None # Current timing report with multiple paths
2110
- self .mhz_to_latency = dict () # dict[mhz] = latency
2111
- self .latency_to_mhz = dict () # dict[latency] = mhz
2143
+ self .mhz_to_latency = dict () # BEST dict[mhz] = latency
2144
+ self .latency_to_mhz = dict () # BEST dict[latency] = mhz
2145
+ self .last_mhz = None
2112
2146
2113
2147
# Coarse grain sweep
2114
2148
self .coarse_latency = None
@@ -3377,6 +3411,8 @@ def hier_sweep_mult_func(target_path_delay_ns, func_path_delay_ns):
3377
3411
better_mhz = curr_mhz > best_mhz_so_far
3378
3412
better_latency = curr_mhz > best_mhz_this_latency
3379
3413
# Log result
3414
+ got_same_mhz_again = curr_mhz == sweep_state .inst_sweep_state [main_inst ].last_mhz
3415
+ sweep_state .inst_sweep_state [main_inst ].last_mhz = curr_mhz
3380
3416
if better_mhz or better_latency :
3381
3417
sweep_state .inst_sweep_state [main_inst ].mhz_to_latency [
3382
3418
curr_mhz
@@ -3403,19 +3439,65 @@ def hier_sweep_mult_func(target_path_delay_ns, func_path_delay_ns):
3403
3439
].best_guess_sweep_mult
3404
3440
* latency_mult
3405
3441
)
3406
- if (
3442
+
3443
+ # TODO: Very much need to organize this
3444
+
3445
+ # Getting exactly the same MHz is a bad sign: the current modules were not pipelined right
3446
+ if got_same_mhz_again :
3447
+ print ("Got identical timing result, trying to pipeline smaller modules instead..." )
3448
+ # Dont compensate with higher fmax, start with original coarse grain compensation on smaller modules
3449
+ # WTF float stuff end up with slice getting repeatedly set just close enough not to slice next level down ?
3450
+ if (
3451
+ sweep_state .inst_sweep_state [
3452
+ main_func
3453
+ ].smallest_not_sliced_hier_mult
3454
+ == sweep_state .inst_sweep_state [
3455
+ main_func
3456
+ ].hier_sweep_mult
3457
+ ):
3458
+ sweep_state .inst_sweep_state [
3459
+ main_func
3460
+ ].hier_sweep_mult += HIER_SWEEP_MULT_INC
3461
+ print (
3462
+ "Nudging hierarchy sweep multiplier:" ,
3463
+ sweep_state .inst_sweep_state [
3464
+ main_func
3465
+ ].hier_sweep_mult ,
3466
+ )
3467
+ else :
3468
+ sweep_state .inst_sweep_state [
3469
+ main_inst
3470
+ ].hier_sweep_mult = sweep_state .inst_sweep_state [
3471
+ main_inst
3472
+ ].smallest_not_sliced_hier_mult
3473
+ print (
3474
+ "Hierarchy sweep multiplier:" ,
3475
+ sweep_state .inst_sweep_state [
3476
+ main_inst
3477
+ ].hier_sweep_mult ,
3478
+ )
3479
+ sweep_state .inst_sweep_state [
3480
+ main_inst
3481
+ ].best_guess_sweep_mult = 1.0
3482
+ sweep_state .inst_sweep_state [
3483
+ main_inst
3484
+ ].coarse_sweep_mult = 1.0
3485
+ made_adj = True
3486
+ elif (
3407
3487
new_best_guess_sweep_mult > BEST_GUESS_MUL_MAX
3408
3488
): # 15 like? main_max_allowed_latency_mult 2.0 magic?
3409
3489
# Fail here, increment sweep mult and try_to_slice logic will slice lower module next time
3410
3490
print (
3411
- "Middle sweep at this hierarchy level failed to meet timing, trying to pipeline current modules to higher fmax to compensate..."
3491
+ "Middle sweep at this hierarchy level failed to meet timing..." ,
3492
+ "Next best guess multiplier was" , new_best_guess_sweep_mult
3412
3493
)
3413
3494
if (
3414
3495
sweep_state .inst_sweep_state [
3415
3496
main_inst
3416
3497
].coarse_sweep_mult
3417
3498
+ COARSE_SWEEP_MULT_INC
3418
- ) <= COARSE_SWEEP_MULT_MAX : # 1.5: # MAGIC?
3499
+ ) <= COARSE_SWEEP_MULT_MAX :
3500
+ print ("Trying to pipeline current modules to higher fmax to compensate..." )
3419
3501
sweep_state .inst_sweep_state [
3420
3502
main_inst
3421
3503
].best_guess_sweep_mult = 1.0
@@ -3766,6 +3848,7 @@ def DO_COARSE_THROUGHPUT_SWEEP(
3766
3848
print (
3767
3849
logic .func_name ,
3768
3850
"reached maximum allowed latency, no more adjustments..." ,
3851
+ "multiplier =" ,max_allowed_latency_mult
3769
3852
)
3770
3853
continue
3771
3854
@@ -3787,7 +3870,8 @@ def DO_COARSE_THROUGHPUT_SWEEP(
3787
3870
and clk_inc >= inst_sweep_state .last_latency_increase
3788
3871
):
3789
3872
# Clip to last inc size - 1, minus one to always be narrowing down
3790
- clk_inc = inst_sweep_state .last_latency_increase - 1
3873
+ # Extra div by 2 helps?
3874
+ clk_inc = int (inst_sweep_state .last_latency_increase / 2 ) # - 1
3791
3875
if clk_inc <= 0 :
3792
3876
clk_inc = 1
3793
3877
clks = inst_sweep_state .coarse_latency + clk_inc
@@ -4723,4 +4807,4 @@ def GET_VHDL_FILES_TCL_TEXT_AND_TOP(
4723
4807
# Use next insts as current
4724
4808
inst_names = set (next_inst_names )
4725
4809
4726
- return files_txt , top_entity_name
4810
+ return files_txt , top_entity_name
0 commit comments