@@ -475,7 +475,7 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod
         for i in range(batch_size):
             idx = i
             self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
-            self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
+            # self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
             self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1)
             self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length
             self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length
@@ -794,8 +794,16 @@ def initialize_forward_meta(self):
         # Update Batch type for cuda graph
         # TODO(gongshaotian): Use seq_lens_encoder to set is_decode_batch
         is_decode_batch = not ((self.share_inputs["seq_lens_this_time"] > 1).sum() > 0)
+
+        # mix ep in single node
+        if self.fd_config.parallel_config.use_ep and self.fd_config.parallel_config.splitwise_role == "mixed":
+            is_decode_batch_list = []
+            paddle.distributed.all_gather_object(is_decode_batch_list, is_decode_batch)
+            is_decode_batch = all(is_decode_batch_list)
+            self.fd_config.parallel_config.moe_phase.phase = "decode" if is_decode_batch else "prefill"
+
         self.forward_meta.step_use_cudagraph = self.use_cudagraph and is_decode_batch
-
+
         # Initialzie attention meta data
         for attn_backend in self.attn_backends:
             attn_backend.init_attention_metadata(self.forward_meta)
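The block added to `initialize_forward_meta` makes every expert-parallel rank agree on the batch type before the MoE phase is set: each rank computes a local `is_decode_batch` flag, `paddle.distributed.all_gather_object` collects one flag per rank, and the phase flips to "decode" only on a unanimous `all(...)`. A minimal sketch of that consensus pattern, assuming an initialized `paddle.distributed` process group (the helper name is ours, not FastDeploy's):

```python
import paddle.distributed as dist

def agree_is_decode_batch(local_is_decode: bool) -> bool:
    """Return True only if every rank sees a pure-decode batch.

    all_gather_object appends one Python object per rank into flags, so
    every rank ends up with the identical list and makes the same phase
    decision -- keeping EP ranks in lockstep when they enter MoE
    collectives that differ between prefill and decode.
    """
    flags = []
    dist.all_gather_object(flags, local_is_decode)
    return all(flags)

# Usage, mirroring the hunk above:
#   is_decode_batch = agree_is_decode_batch(is_decode_batch)
#   moe_phase.phase = "decode" if is_decode_batch else "prefill"
```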
@@ -1163,15 +1171,10 @@ class at the server level, which is too granular for ModelRunner.
             We plan to replace it with 'ModelForwardBatch'.
             intermediate_tensors:
         """
-        is_decode_batch = not ((self.share_inputs["seq_lens_this_time"]
-                                > 1).sum() > 0)
-
-        # mix ep in single node
-        if self.fd_config.parallel_config.use_ep and self.fd_config.parallel_config.splitwise_role == "mixed":
-            is_decode_batch_list = []
-            paddle.distributed.all_gather_object(is_decode_batch_list, is_decode_batch)
-            is_decode_batch = all(is_decode_batch_list)
-            self.fd_config.parallel_config.moe_phase.phase = "decode" if is_decode_batch else "prefill"
+        # 1. Prepare inputs of model and sampler.
+        skip_idx_list = self._get_skip_idx(model_forward_batch)
+        self._prepare_inputs()
+        self.sampler.pre_process(skip_idx_list)
 
         # NOTE(wufeisheng): If `not_need_stop`` is False, it means the current worker is in an idle state.
         # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
@@ -1180,11 +1183,6 @@ class at the server level, which is too granular for ModelRunner.
             self._execute_empty_input()
             return None
 
-        # 1. Prepare inputs of model and sampler.
-        skip_idx_list = self._get_skip_idx(model_forward_batch)
-        self._prepare_inputs()
-        self.sampler.pre_process(skip_idx_list)
-
         # 2. Padding inputs for cuda graph
         self.padding_cudagraph_inputs()
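Read together, the last two hunks move the phase-consensus logic out of `execute_model` (into `initialize_forward_meta`, per the second hunk) and hoist input preparation above the idle-worker check, so even a rank with no work refreshes its inputs before joining collectives. A skeleton of the resulting ordering; only the calls shown in the diff are real, while the class name and the `not_need_stop()` guard are our paraphrase of the NOTE comment:

```python
class _ModelRunnerSketch:
    """Ordering sketch only; the helper methods are the stubs named in the diff."""

    def execute_model(self, model_forward_batch=None):
        # 1. Prepare inputs of model and sampler (now before the idle check,
        #    so every EP rank refreshes its inputs each step).
        skip_idx_list = self._get_skip_idx(model_forward_batch)
        self._prepare_inputs()
        self.sampler.pre_process(skip_idx_list)

        # Per the NOTE: a worker whose not_need_stop is False is idle; in EP
        # mode it runs an empty forward instead of skipping the collectives.
        if not self.not_need_stop():
            self._execute_empty_input()
            return None

        # 2. Padding inputs for cuda graph
        self.padding_cudagraph_inputs()
        # ... remaining steps outside these hunks ...
```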