Commit 70ee910

[Executor] Change cudagraph hash key from batch size to num_tokens (#3454)
1 parent ea4a3b4 commit 70ee910

2 files changed, 27 insertions(+), 27 deletions(-)

fastdeploy/config.py

Lines changed: 10 additions & 10 deletions
@@ -487,7 +487,7 @@ def __init__(
         self.full_cuda_graph: bool = True

         self.max_capture_size: int = None
-        self.batch_size_to_captured_size: dict[int, int] = None
+        self.real_shape_to_captured_size: dict[int, int] = None
         # CINN Config ...
         if args is not None:
             for key, value in args.items():
@@ -516,26 +516,26 @@ def init_with_cudagrpah_size(self, max_num_seqs: int = 0) -> None:
         self.cudagraph_capture_sizes.sort(reverse=True)
         self.max_capture_size = self.cudagraph_capture_sizes[0] if self.cudagraph_capture_sizes else 0

-        # Pre-compute the mapping from batch size to padded graph size
-        self.batch_size_to_captured_size = {}
+        # Pre-compute the mapping from shape to padded graph size
+        self.real_shape_to_captured_size = {}
         for end, start in zip(self.cudagraph_capture_sizes, self.cudagraph_capture_sizes[1:] + [0]):
             for bs in range(start, end):
                 if bs == start:
-                    self.batch_size_to_captured_size[bs] = start
+                    self.real_shape_to_captured_size[bs] = start
                 else:
-                    self.batch_size_to_captured_size[bs] = end
-        self.batch_size_to_captured_size[self.max_capture_size] = self.max_capture_size
+                    self.real_shape_to_captured_size[bs] = end
+        self.real_shape_to_captured_size[self.max_capture_size] = self.max_capture_size

     def _set_cudagraph_sizes(self, max_num_seqs: int = 0):
         """
-        Calculate a series of candidate capture batch sizes,
+        Calculate a series of candidate capture sizes,
         and then extract a portion of them as the capture list for the CUDA graph based on user input.
         """
-        # Batch Size [1, 2, 4, 8, 16, ... 120, 128]
+        # Shape [1, 2, 4, 8, 16, ... 120, 128]
         draft_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
-        # Batch Size [128, 144, ... 240, 256]
+        # Shape [128, 144, ... 240, 256]
         draft_capture_sizes += [16 * i for i in range(9, 17)]
-        # Batch Size [256, 288, ... 992, 1024]
+        # Shape [256, 288, ... 992, 1024]
         draft_capture_sizes += [32 * i for i in range(17, 33)]

         draft_capture_sizes.append(max_num_seqs)
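
For reference, the following is a minimal standalone sketch of the padding map that init_with_cudagrpah_size builds in the hunk above. The capture_sizes list here is an invented example (the real list comes from _set_cudagraph_sizes and user config), but the loop mirrors the diff: every real shape (num_tokens) is rounded up to the nearest captured size.

# Standalone sketch with illustrative values, not the library's defaults.
capture_sizes = [8, 4, 2, 1]          # already sorted in descending order
max_capture_size = capture_sizes[0]   # 8

real_shape_to_captured_size = {}
for end, start in zip(capture_sizes, capture_sizes[1:] + [0]):
    for shape in range(start, end):
        # An exact capture size maps to itself; anything in between is
        # padded up to the next larger captured size.
        real_shape_to_captured_size[shape] = start if shape == start else end
real_shape_to_captured_size[max_capture_size] = max_capture_size

assert real_shape_to_captured_size[3] == 4   # 3 tokens replay the size-4 graph
assert real_shape_to_captured_size[5] == 8   # padded up to the next captured size
assert real_shape_to_captured_size[8] == 8   # exact match at the maximum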

fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py

Lines changed: 17 additions & 17 deletions
@@ -29,9 +29,9 @@

 @dataclass
 class ConcreteSizeEntry:
-    """Record the concrete information corresponding to the current batch size"""
+    """Record the concrete information corresponding to the current shape(num_tokens)"""

-    # Concrete batch size
+    # Concrete shape
     runtime_bs: int
     # The size is in cudagraph_capture_sizes
     use_cudagraph: bool = True
@@ -42,7 +42,7 @@ class ConcreteSizeEntry:
     runnable: Callable = None  # type: ignore
     # Number of completed warmups
     num_finished_warmup: int = 0
-    # Captured cuda graph object corresponding to the current batch size
+    # Captured cuda graph object corresponding to the current real shape
     cuda_graph: Optional[graphs.CUDAGraph] = None
     # Output buffer of cudagraph
     output_buffer: Optional[paddle.Tensor] = None
@@ -60,33 +60,33 @@ def __init__(
         self.runnable = runnable
         self.cudagraph_capture_sizes = fd_config.graph_opt_config.cudagraph_capture_sizes
         self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
-        self.batch_size_to_captured_size = fd_config.graph_opt_config.batch_size_to_captured_size
+        self.real_shape_to_captured_size = fd_config.graph_opt_config.real_shape_to_captured_size

-        # Runtime batch size -> ConcreteSizeEntry
+        # Runtime real shape -> ConcreteSizeEntry
         self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}

         for shape in self.cudagraph_capture_sizes:
             self.concrete_size_entries[shape] = ConcreteSizeEntry(runtime_bs=shape)

         logger.info(
-            f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, " "Created all batch sizes entry."
+            f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, " "Created all real shape entry."
         )

     def __call__(self, **kwargs):
-        # Get batch size
+        # Get real shape(all num tokens)
         ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
-        batch_size = ids_remove_padding.shape[0]
-        padding_batch_size = self.batch_size_to_captured_size[batch_size]
+        real_shape = ids_remove_padding.shape[0]
+        padding_real_shape = self.real_shape_to_captured_size[real_shape]
         logger.debug(
-            f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, "
-            f"The padded batch size is :{padding_batch_size}"
+            f"[CUDA GRAPH] The actual real shape obtained by CUDAGraph is :{real_shape}, "
+            f"The padded shape is :{padding_real_shape}"
         )

-        entry = self.concrete_size_entries.get(padding_batch_size)
-        assert entry is not None, f"Batch size:{padding_batch_size} is not in cuda graph capture list."
+        entry = self.concrete_size_entries.get(padding_real_shape)
+        assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list."
         if entry.runnable is None:
             entry.runnable = self.runnable
-            logger.debug(f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}")
+            logger.debug(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")

         if not entry.use_cudagraph:
             return entry.runnable(**kwargs)
@@ -98,7 +98,7 @@ def __call__(self, **kwargs):
                 entry.num_finished_warmup += 1
                 entry.runnable(**kwargs)
                 logger.debug(
-                    f"[CUDA GRAPH] Warm up for batch size {padding_batch_size}, "
+                    f"[CUDA GRAPH] Warm up for real shape {padding_real_shape}, "
                     f"finished ({n + 1}/{entry.num_finished_warmup}) times"
                 )

@@ -122,9 +122,9 @@ def __call__(self, **kwargs):
            output._clear

            paddle.device.synchronize()
-           logger.debug(f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}")
+           logger.debug(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")

        # Replay
        entry.cuda_graph.replay()
-       logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}")
+       logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
        return entry.output_buffer
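
To illustrate the renamed lookup path, here is a condensed sketch of the dispatch performed by __call__ above: the token count after padding removal (ids_remove_padding.shape[0]) is padded via real_shape_to_captured_size and used to select the matching ConcreteSizeEntry. The toy mapping, the toy input value, and the simplified entry class are stand-ins, not the backend's real objects.

from dataclasses import dataclass
from typing import Dict, Optional

@dataclass
class ConcreteSizeEntry:
    # Simplified stand-in for the dataclass in the diff above.
    runtime_bs: int                 # the captured real shape (num_tokens)
    use_cudagraph: bool = True
    cuda_graph: Optional[object] = None

# Toy mapping and entries; in the backend these come from fd_config.graph_opt_config.
real_shape_to_captured_size: Dict[int, int] = {3: 4, 4: 4, 5: 8, 8: 8}
concrete_size_entries: Dict[int, ConcreteSizeEntry] = {
    s: ConcreteSizeEntry(runtime_bs=s) for s in (4, 8)
}

real_shape = 5                      # would be ids_remove_padding.shape[0] at runtime
padding_real_shape = real_shape_to_captured_size[real_shape]
entry = concrete_size_entries.get(padding_real_shape)
assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list."
print(entry.runtime_bs)             # 8 -> the size-8 graph is warmed up, captured, replayed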
