
Commit 7a23dae

Add a hier_mult command line argument for setting the hierarchy multiplier used in the default pipelining iterations. Find an optimal hierarchy multiplier value for RT on ECP5. Detect identical, unchanged timing results as a bad sign and take a different action instead of repeating. Tune other pipelining magic numbers: disable submodule attempts at higher-fmax compensation and halve the coarse sweep latency increase jump size. Always use the nextpnr default router.
1 parent fe22dc6 commit 7a23dae

3 files changed: +140 -50 lines

src/OPEN_TOOLS.py

Lines changed: 6 additions & 5 deletions
@@ -370,11 +370,12 @@ def SYN_AND_REPORT_TIMING_NEW(
     m_ghdl = ""
     if not GHDL_PLUGIN_BUILT_IN:
         m_ghdl = "-m ghdl "
-    optional_router2 = "--router router2"
-    if inst_name:
-        # Dont use router two for small single instances
-        # Only use router two for multi main top level no inst_name
-        optional_router2 = ""
+    optional_router2 = "" # Always default router for now...
+    #optional_router2 = "--router router2"
+    #if inst_name:
+    #    # Dont use router two for small single instances
+    #    # Only use router two for multi main top level no inst_name
+    #    optional_router2 = ""
     sh_file = top_entity_name + ".sh"
     sh_path = output_directory + "/" + sh_file
     f = open(sh_path, "w")
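The change above hard-codes optional_router2 to an empty string, so the generated .sh script always runs nextpnr with its default router rather than opting into router2. A rough sketch of how such a flag typically folds into the nextpnr command line (the helper and file names below are illustrative, not the exact strings SYN_AND_REPORT_TIMING_NEW writes out):

    # Illustrative only: an empty optional_router2 simply drops the flag,
    # letting nextpnr fall back to its default router.
    def nextpnr_cmd(top_entity_name, target_mhz, use_router2=False):
        optional_router2 = "--router router2" if use_router2 else ""
        parts = [
            "nextpnr-ecp5",
            "--json", top_entity_name + ".json",
            "--freq", str(target_mhz),
            optional_router2,
        ]
        return " ".join(p for p in parts if p)  # skip the empty flag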

src/SYN.py

Lines changed: 102 additions & 18 deletions
@@ -37,7 +37,10 @@
 # Welcome to the land of magic numbers
 # "But I think its much worse than you feared" Modest Mouse - I'm Still Here
 
-#WTF?
+# Artix 7 100t full RT:
+# Coarse sweep BEST IS sliced 494 clocks, get 490 clock pipeline at 153MHz
+# Coarse (not sweep) gets to 504 slice, 500 clk, 10 clks off fine for now...
+# With BEST_GUESS_MUL_MAX = 10.0, COARSE_SWEEP_MULT_MAX = 1.5, not skipping higher fmax to compensate
 # RT pixel_logic
 # HIER_SWEEP_MULT_MIN latency # top runs
 # 0.125 470 32
@@ -57,23 +60,53 @@
 # 2.0 585 9
 # 2.25 493 8
 # 2.5 509 7
+#=== After Recent updates
+# 2.0? 536 5
+# 1.9375 343 8
+# 1.5 368 8
+# 1.0 417 15
+# 0.5 ~417 ~45 reaches 1.0x start after ~33runs
+# 0.0 ~417 ~65 reaches 1.0x after ~55 runs
+
+# ECP5 Full RT:
+# Coarse sweep gets to: 75 clocks
+# Coarse (not sweep) gets to 81 clocks
+# HIER_SWEEP_MULT_MIN latency # top runs
+# 2.0 80+clks never finished ~24hrs hash=5c32
+# 1.9375 ^same
+# 1.5 83+clks never finished ~12 hours hash=a39f
+# 1.0 63+clks never finished ~12 hrs hash=e748
+# 0.5 45+clks never finished ~12 hr hash=22ef
+# 0.25 60+clks never finished 12+hours hash=5237
+# 0.181 79+clks never finished 12+hours hash=222f
+# 0.125 ^same
+# 0.09375 ^same
+# 0.078125 \/same
+# 0.06780026720106881
+# 0.0625 66 4 ^ ends up at slighty higher after failing coarse
+# 0.046875 ^same
+# 0.03125 \/same
+# 0.0 70 2 # wow
+
+# Tool increases HIER_SWEEP_MULT_MIN if it has trouble
+# i.e. tries to pipeline smaller/less logic modules, easier task
+# Starting off too large is bad - no way to recover
 
-MAX_N_WORSE_RESULTS_MULT = 16 # Multiplier for how many times failing to improve before moving on? divided by total latnecy
-BEST_GUESS_MUL_MAX = 10.0 # Multiplier limit on top down register insertion coarsly during middle out sweep
-MAX_ALLOWED_LATENCY_MULT = (
-    10 # Multiplier limit for individual module coarse register insertion coarsely, similar same as BEST_GUESS_MUL_MAX?
-)
 # Target pipelining at minimum at modules of period = target/MULT
+# 0.0 -> start w/ largest/top level modules first
 # 0.5 -> 2 times the target period (not meeting timing),
 # 2.0 -> 1/2 the target period (easily meets timing)
-HIER_SWEEP_MULT_MIN = (
-    1.9375
-)
+HIER_SWEEP_MULT_MIN = None # Cmd line arg sets
 HIER_SWEEP_MULT_INC = (
-    0.001 # Intentionally very small, sweep tries to make largest possible steps
+    0.001 # Intentionally very small, sweep already tries to make largest possible steps
+)
+MAX_N_WORSE_RESULTS_MULT = 16 # Multiplier for how many times failing to improve before moving on? divided by total latnecy
+BEST_GUESS_MUL_MAX = 5.0 # Multiplier limit on top down register insertion coarsly during middle out sweep
+MAX_ALLOWED_LATENCY_MULT = (
+    5.0 # Multiplier limit for individual module coarse register insertion coarsely, similar same as BEST_GUESS_MUL_MAX?
 )
 COARSE_SWEEP_MULT_INC = 0.1 # Multiplier increment for how many times fmax to try for compensating not meeting timing
-COARSE_SWEEP_MULT_MAX = 1.5 # Max multiplier for internal fmax
+COARSE_SWEEP_MULT_MAX = 1.0 # ==1.0 disables trying to pipeline to fake higher fmax, Max multiplier for internal fmax
 MAX_CLK_INC_RATIO = 1.25 # Multiplier for how any extra clocks can be added ex. 1.25 means 25% more stages max
 DELAY_UNIT_MULT = 10.0 # Timing is reported in nanoseconds. Multiplier to convert that time into integer units (nanosecs, tenths, hundreds of nanosecs)
 INF_MHZ = 1000 # Impossible timing goal
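The HIER_SWEEP_MULT_MIN comments above boil down to a threshold on the ratio of the target clock period to a module's estimated path delay. A rough sketch of that selection rule, assuming hier_sweep_mult_func is that ratio (the real try_to_slice logic in SYN.py is more involved than this):

    # Hypothetical helper names; only the threshold rule is taken from the comments above.
    def hier_sweep_mult(target_path_delay_ns, func_path_delay_ns):
        # Ratio of the target clock period to a module's estimated path delay
        return target_path_delay_ns / func_path_delay_ns

    def pipelining_candidates(module_delays_ns, target_delay_ns, hier_sweep_mult_min):
        # mult_min = 0.0 -> even the largest/slowest modules qualify (new default)
        # mult_min = 0.5 -> only modules within 2x the target period
        # mult_min = 2.0 -> only modules already at 1/2 the target period
        return {
            name
            for name, delay_ns in module_delays_ns.items()
            if hier_sweep_mult(target_delay_ns, delay_ns) >= hier_sweep_mult_min
        }

    # Example with a 10 ns target period (100 MHz):
    delays = {"top": 40.0, "pixel_logic": 18.0, "small_alu": 4.0}
    print(pipelining_candidates(delays, 10.0, 0.0))  # all three modules
    print(pipelining_candidates(delays, 10.0, 0.5))  # pixel_logic and small_alu
    print(pipelining_candidates(delays, 10.0, 2.0))  # small_alu only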
@@ -2107,8 +2140,9 @@ class InstSweepState:
     def __init__(self):
         self.met_timing = False
         self.timing_report = None # Current timing report with multiple paths
-        self.mhz_to_latency = dict() # dict[mhz] = latency
-        self.latency_to_mhz = dict() # dict[latency] = mhz
+        self.mhz_to_latency = dict() # BEST dict[mhz] = latency
+        self.latency_to_mhz = dict() # BEST dict[latency] = mhz
+        self.last_mhz = None
 
         # Coarse grain sweep
         self.coarse_latency = None
@@ -3377,6 +3411,8 @@ def hier_sweep_mult_func(target_path_delay_ns, func_path_delay_ns):
         better_mhz = curr_mhz > best_mhz_so_far
         better_latency = curr_mhz > best_mhz_this_latency
         # Log result
+        got_same_mhz_again = curr_mhz == sweep_state.inst_sweep_state[main_inst].last_mhz
+        sweep_state.inst_sweep_state[main_inst].last_mhz = curr_mhz
         if better_mhz or better_latency:
             sweep_state.inst_sweep_state[main_inst].mhz_to_latency[
                 curr_mhz
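Together with the last_mhz field added to InstSweepState above, the logging step amounts to roughly the following (illustrative helper only; the real code keeps this state per instance in sweep_state.inst_sweep_state[main_inst] and continues with more bookkeeping after the update):

    # Illustrative condensation of the two hunks above, not the actual sweep code.
    def log_result(state, curr_mhz, curr_latency, best_mhz_so_far, best_mhz_this_latency):
        # An fmax identical to the previous run is treated as a bad sign later on.
        got_same_mhz_again = curr_mhz == state.last_mhz
        state.last_mhz = curr_mhz
        if curr_mhz > best_mhz_so_far or curr_mhz > best_mhz_this_latency:
            state.mhz_to_latency[curr_mhz] = curr_latency  # BEST dict[mhz] = latency
            state.latency_to_mhz[curr_latency] = curr_mhz  # BEST dict[latency] = mhz
        return got_same_mhz_again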
@@ -3403,19 +3439,65 @@ def hier_sweep_mult_func(target_path_delay_ns, func_path_delay_ns):
             ].best_guess_sweep_mult
             * latency_mult
         )
-        if (
+
+        # TODO: Very much need to organize this
+
+        # Getting exactly same MHz is bad sign, that didnt pipeline current modules right
+        if got_same_mhz_again:
+            print("Got identical timing result, trying to pipeline smaller modules instead...")
+            # Dont compensate with higher fmax, start with original coarse grain compensation on smaller modules
+            # WTF float stuff end up with slice getting repeatedly set just close enough not to slice next level down ?
+            if (
+                sweep_state.inst_sweep_state[
+                    main_func
+                ].smallest_not_sliced_hier_mult
+                == sweep_state.inst_sweep_state[
+                    main_func
+                ].hier_sweep_mult
+            ):
+                sweep_state.inst_sweep_state[
+                    main_func
+                ].hier_sweep_mult += HIER_SWEEP_MULT_INC
+                print(
+                    "Nudging hierarchy sweep multiplier:",
+                    sweep_state.inst_sweep_state[
+                        main_func
+                    ].hier_sweep_mult,
+                )
+            else:
+                sweep_state.inst_sweep_state[
+                    main_inst
+                ].hier_sweep_mult = sweep_state.inst_sweep_state[
+                    main_inst
+                ].smallest_not_sliced_hier_mult
+                print(
+                    "Hierarchy sweep multiplier:",
+                    sweep_state.inst_sweep_state[
+                        main_inst
+                    ].hier_sweep_mult,
+                )
+            sweep_state.inst_sweep_state[
+                main_inst
+            ].best_guess_sweep_mult = 1.0
+            sweep_state.inst_sweep_state[
+                main_inst
+            ].coarse_sweep_mult = 1.0
+            made_adj = True
+        elif (
             new_best_guess_sweep_mult > BEST_GUESS_MUL_MAX
         ): # 15 like? main_max_allowed_latency_mult 2.0 magic?
             # Fail here, increment sweep mut and try_to_slice logic will slice lower module next time
             print(
-                "Middle sweep at this hierarchy level failed to meet timing, trying to pipeline current modules to higher fmax to compensate..."
+                "Middle sweep at this hierarchy level failed to meet timing...",
+                "Next best guess multiplier was", new_best_guess_sweep_mult
             )
             if (
                 sweep_state.inst_sweep_state[
                     main_inst
                 ].coarse_sweep_mult
                 + COARSE_SWEEP_MULT_INC
-            ) <= COARSE_SWEEP_MULT_MAX: # 1.5: # MAGIC?
+            ) <= COARSE_SWEEP_MULT_MAX:
+                print("Trying to pipeline current modules to higher fmax to compensate...")
                 sweep_state.inst_sweep_state[
                     main_inst
                 ].best_guess_sweep_mult = 1.0
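The branching above condenses to roughly the following (an illustrative simplification: it merges the main_func/main_inst lookups into a single state object, and the statements past the end of the hunk, such as the coarse_sweep_mult increment, are assumed rather than shown in the diff):

    # Sketch only; uses the module constants defined near the top of SYN.py.
    def adjust_after_result(st, got_same_mhz_again, new_best_guess_sweep_mult):
        if got_same_mhz_again:
            # Identical fmax again: stop compensating and move down to smaller modules.
            if st.smallest_not_sliced_hier_mult == st.hier_sweep_mult:
                st.hier_sweep_mult += HIER_SWEEP_MULT_INC  # nudge past float equality
            else:
                st.hier_sweep_mult = st.smallest_not_sliced_hier_mult
            st.best_guess_sweep_mult = 1.0
            st.coarse_sweep_mult = 1.0
            return True  # made an adjustment
        if new_best_guess_sweep_mult > BEST_GUESS_MUL_MAX:
            # Middle-out sweep failed at this hierarchy level; only compensate with a
            # higher internal fmax while COARSE_SWEEP_MULT_MAX > 1.0 (now disabled by default).
            if st.coarse_sweep_mult + COARSE_SWEEP_MULT_INC <= COARSE_SWEEP_MULT_MAX:
                st.best_guess_sweep_mult = 1.0
                st.coarse_sweep_mult += COARSE_SWEEP_MULT_INC  # assumed follow-on step
                return True
        return False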
@@ -3766,6 +3848,7 @@ def DO_COARSE_THROUGHPUT_SWEEP(
             print(
                 logic.func_name,
                 "reached maximum allowed latency, no more adjustments...",
+                "multiplier =",max_allowed_latency_mult
             )
             continue
 
@@ -3787,7 +3870,8 @@ def DO_COARSE_THROUGHPUT_SWEEP(
             and clk_inc >= inst_sweep_state.last_latency_increase
         ):
             # Clip to last inc size - 1, minus one to always be narrowing down
-            clk_inc = inst_sweep_state.last_latency_increase - 1
+            # Extra div by 2 helps?
+            clk_inc = int(inst_sweep_state.last_latency_increase/2) # - 1
         if clk_inc <= 0:
             clk_inc = 1
         clks = inst_sweep_state.coarse_latency + clk_inc
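With the halving rule, a failed latency jump narrows geometrically instead of one clock at a time. A toy illustration (the numbers are made up):

    # Illustrative only: successive clamped increments starting from a failed jump of 16.
    def next_clk_inc(last_latency_increase):
        clk_inc = int(last_latency_increase / 2)  # was: last_latency_increase - 1
        return max(clk_inc, 1)

    inc = 16
    while inc > 1:
        inc = next_clk_inc(inc)
        print(inc)  # 8, 4, 2, 1 rather than 15, 14, 13, ...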
@@ -4723,4 +4807,4 @@ def GET_VHDL_FILES_TCL_TEXT_AND_TOP(
         # Use next insts as current
         inst_names = set(next_inst_names)
 
-    return files_txt, top_entity_name
\ No newline at end of file
+    return files_txt, top_entity_name

src/pipelinec

Lines changed: 32 additions & 27 deletions
@@ -39,8 +39,10 @@ parser.add_argument("--modelsim", help="Setup simulation files for use with Mode
 parser.add_argument("--cxxrtl", help="Setup simulation files for use with GHDL+Yosys+CXXRTL.", action="store_true")
 parser.add_argument("--verilator", help="Setup simulation files for use with GHDL+Yosys+Verilator.", action="store_true")
 parser.add_argument("--main_cpp", type=str, default=None,help=f"Specify an existing main C++ file for simulators such as CXXRTL and Verilator.")
+parser.add_argument("--hier_mult", type=float, default=0.0,
+    help="Hierarchy sweep multiplier minimum starting value ~= 'what modules to focus pipelining efforts on to get to desired fmax'. 0.0 = starts with largest/slowest modules (default). 0.5 = modules half as fast fmax. 2.0 = modules twice as fast as fmax, etc.")
 parser.add_argument("--coarse",
-    help="Do only a single coarse grain autopipelining sweep (as opposed to default of several multi-level coarse sweeps).",
+    help="Do only a single coarse grain autopipelining sweep (as opposed to default of hierarchical several multi-level coarse sweeps).",
     action="store_true")
 parser.add_argument("--start", type=int, default=None, help="When doing a --coarse sweep, start from START cycles latency.")
 parser.add_argument("--stop", type=int, default=None, help="When doing a --coarse sweep, stop at STOP cycles latency.")
@@ -64,6 +66,8 @@ c_file = os.path.abspath(args.c_file)
 SYN.TOP_LEVEL_MODULE = args.top
 # Flag for verilog conversion
 SYN.CONVERT_FINAL_TOP_VERILOG = args.verilog
+# Hierarchy sweep const
+SYN.HIER_SWEEP_MULT_MIN = args.hier_mult
 # Output dir:
 # Use what user provided
 # Or create a new dir for each run by default
@@ -134,37 +138,40 @@ SIM.DO_OPTIONAL_SIM(args.sim, parser_state, args, multimain_timing_params)
 # No need to scare folks with my big dumb todo list printing...
 """
 TODO:
+Python cleanup PR
 Bartus help w/ Vitis HLS integration
-How to update wiki page?
-Make global wires read and written where/when used in pipeline??
-  Structs and const ref funcs break easy implementation of
-  'first func use' detection for the entire global var at oince
-  Need to propogate knowledge of read only global across const refs
-    And other no delay funcs?
-  volatile to say 'every read is different'
-    Can help reduce registers for ray tracer like Victor said
-  Async wires are another volatile++ like case where
-    pure downstream fo async funcs even with logic delay arent registered in pipeline
+--xo mode making packaing tcl?
+Fix duplicate instances of FSM funcs
+  https://github.com/JulianKemmerer/PipelineC/issues/86
+  Default to one single inst per MAIN thread?
+    Requires flattening into one module?
+    Make inst of every func used in thread?
+  Make all funcs single instance?
+  Also can fix bug in FSM style extra clock?
+    Extra clock when just func inside nested loops? ex. GoL count live neighbors
 Victor's RT
   disable -Whide for less ghdl warnings
   Yosyshq article!
-  Victor is setting up ECP5 hardware and testing basic flow...
-  nextpnr has better packing now
-    Still can only make it to ~21MHz at full RT
-  Try to rework state wire for fewer registers? (read+written where used)
-    See if reg est log shows it being extra reg'd?
   Try verilating whole pipeline?
-Frame buffer with 3 clock domains(vga, RAM, compute)
+Document INST_ARRAY array vars
+  Is read/write only style and auto connected to N insts in design
+  Can build arbiters out of that (simple data,valid, ready, or even AXI)
+Does clang does constexpr/temaples well
+  Try cindex test w/ preprocessing too
+  Isolate .pipelinec specific files to be parsed by clang in isolation
+  Simple example could be multiplier func inferring out bits from in...
+  Ideal example is FIR bit width growth calc
+    Need abitrary bit width integer type?
+    Fixed point stuff?
+General shared buffer with 3 clock domains(vga, RAM, compute) (RO, the RAM, RW)
   Valid ready signalling
-    Need N<->M sharing from below too?
-    Needs async start and complete from below?
+    Use N<->M sharing
  Use to make double frame buffer fast ~final GOL demo?
+    Needs async start and complete from below?
  How to do async start and complete (prob w/ ready+valid for cpu polling)
-  How to share N instances among M users?
-    Use frame buffer ram as demo?
-    If RAM then best to use AXI? :-/
-  Allow setting slower FSM style arb styles with more regs, not zero cycle
-Fix duplicate instances of FSM funcs (is git issue)
+  Redo FSM Style arb with N-M sharing wires
+    User will need to specify number of users of SINGLE INST (inst array size)
+  Allow setting slower FSM style arb styles with more regs, not zero cycle
 How to use delay caches?
   Path delay cache
   Redo attempt at adding up delays in netlist?
@@ -175,12 +182,10 @@ notypes+bartus talk about autopipelined feedback/stall/clock enable
 how to better handle clock enables / ready feedback? / resets even
   try to use example axis demo thing pipeline contorl signals.c
 Victor's other secret thing
-  Hope I can remember details because sure can put anything here
+  Hope I can remember details because sure cant put anything here
 Make VHDL IEEE committee suggestion for record to-from slv like verilog packing
 BUGS: Missing typedef support!?
   Shouldnt be too bad, can hack around
-Fix bug in FSM style extra clock
-  Extra clock when just func inside nested loops? ex. GoL count live neighbors
 Victor wave pipelining small example
   Measure min and max delays
   (also applies to existing pipelined circuit not just comb logic)
