Commit 2f949c6

fix conflict
1 parent cc2d123 commit 2f949c6

File tree

3 files changed: +15 additions, -19 deletions

3 files changed

+15
-19
lines changed

fastdeploy/config.py

Lines changed: 1 addition & 1 deletion
@@ -154,7 +154,7 @@ def __init__(
     ):
         self.sequence_parallel = False  # Whether to enable sequence parallelism.
         self.use_ep = False  # Whether to enable Expert Parallelism
-        self.moe_phase = MoEPhase.PREFILL  # Generation phase
+        self.moe_phase = MoEPhase("prefill")  # Generation phase
         self.msg_queue_id = 1  # mesage queue id

         self.tensor_parallel_rank = 0  # TP rank ID
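
The change above swaps the immutable enum constant MoEPhase.PREFILL for an instance, MoEPhase("prefill"). That matters because the runner hunk further down reassigns the phase at runtime (moe_phase.phase = "decode"), which an enum member would not allow. A minimal sketch of what such a class might look like, assuming a simple mutable holder; the validation logic is illustrative, not code from this commit:

class MoEPhase:
    """Current MoE generation phase; a mutable holder rather than an enum member."""

    _VALID = ("prefill", "decode")  # assumed phase names, taken from the runner hunk

    def __init__(self, phase: str = "prefill"):
        self.phase = phase

    @property
    def phase(self) -> str:
        return self._phase

    @phase.setter
    def phase(self, value: str) -> None:
        # Guard against typos; purely illustrative, not from the commit.
        if value not in self._VALID:
            raise ValueError(f"unknown MoE phase: {value!r}")
        self._phase = value

# Usage mirroring the two sides of this commit:
moe_phase = MoEPhase("prefill")   # construction, as in config.py
moe_phase.phase = "decode"        # runtime flip, as in gpu_model_runner.py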

fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py

Lines changed: 0 additions & 2 deletions
@@ -75,8 +75,6 @@ def init_ep(self, layer: nn.Layer) -> None:
                 layer.fd_config.parallel_config.splitwise_role,
                 layer.ep_size, layer.ep_rank)

->>>>>>> fc552930 (Support mixed ep)
-
     def process_loaded_weights(self, layer, weights) -> None:
         """
         process_loaded_weights
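
The two deleted lines are a leftover Git merge-conflict marker from the "Support mixed ep" merge, which is the conflict the commit message refers to. A stray marker is not valid Python, so the module fails to import at all; a quick, self-contained way to see that:

import ast

# Minimal stand-in for the broken module text; the marker is copied from the diff.
source_with_marker = """\
x = 1
>>>>>>> fc552930 (Support mixed ep)
"""

try:
    ast.parse(source_with_marker)
except SyntaxError as exc:
    # The parser rejects the marker line, which is why the file had to be fixed.
    print(f"SyntaxError at line {exc.lineno}: {exc.msg}")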

fastdeploy/worker/gpu_model_runner.py

Lines changed: 14 additions & 16 deletions
@@ -475,7 +475,7 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod
         for i in range(batch_size):
             idx = i
             self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
-            self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
+            # self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
             self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1)
             self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length
             self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length
@@ -794,8 +794,16 @@ def initialize_forward_meta(self):
         # Update Batch type for cuda graph
         # TODO(gongshaotian): Use seq_lens_encoder to set is_decode_batch
         is_decode_batch = not ((self.share_inputs["seq_lens_this_time"] > 1).sum() > 0)
+
+        # mix ep in single node
+        if self.fd_config.parallel_config.use_ep and self.fd_config.parallel_config.splitwise_role == "mixed":
+            is_decode_batch_list = []
+            paddle.distributed.all_gather_object(is_decode_batch_list, is_decode_batch)
+            is_decode_batch = all(is_decode_batch_list)
+            self.fd_config.parallel_config.moe_phase.phase = "decode" if is_decode_batch else "prefill"
+
         self.forward_meta.step_use_cudagraph = self.use_cudagraph and is_decode_batch
-
+
         # Initialzie attention meta data
         for attn_backend in self.attn_backends:
             attn_backend.init_attention_metadata(self.forward_meta)
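
The hunk above turns the decode-versus-prefill decision into a collective one: each rank computes a local verdict from seq_lens_this_time, the verdicts are exchanged with paddle.distributed.all_gather_object, and the batch only counts as decode when every rank agrees, so all EP workers flip moe_phase in lockstep. A standalone sketch of the same pattern, runnable under paddle.distributed.launch with two or more processes; the sequence lengths are invented for illustration:

import paddle
import paddle.distributed as dist

def agree_on_decode(seq_lens_this_time: paddle.Tensor) -> bool:
    # A batch is pure decode when no request contributes more than one token.
    local_is_decode = not bool((seq_lens_this_time > 1).sum() > 0)

    # all_gather_object exchanges plain Python objects, so the bool needs
    # no tensor conversion, exactly as in the hunk above.
    verdicts = []
    dist.all_gather_object(verdicts, local_is_decode)

    # One rank still prefilling forces every rank onto the prefill path.
    return all(verdicts)

if __name__ == "__main__":
    dist.init_parallel_env()
    # Invented per-rank inputs: rank 0 sees a decode step, the others a prefill.
    lens = [1, 1, 1] if dist.get_rank() == 0 else [4, 1]
    print(dist.get_rank(), agree_on_decode(paddle.to_tensor(lens)))
    # With two ranks, both print False, because rank 1 is still prefilling.
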
@@ -1163,15 +1171,10 @@ class at the server level, which is too granular for ModelRunner.
         We plan to replace it with 'ModelForwardBatch'.
         intermediate_tensors:
         """
-        is_decode_batch = not ((self.share_inputs["seq_lens_this_time"]
-                                > 1).sum() > 0)
-
-        # mix ep in single node
-        if self.fd_config.parallel_config.use_ep and self.fd_config.parallel_config.splitwise_role == "mixed":
-            is_decode_batch_list = []
-            paddle.distributed.all_gather_object(is_decode_batch_list, is_decode_batch)
-            is_decode_batch = all(is_decode_batch_list)
-            self.fd_config.parallel_config.moe_phase.phase = "decode" if is_decode_batch else "prefill"
+        # 1. Prepare inputs of model and sampler.
+        skip_idx_list = self._get_skip_idx(model_forward_batch)
+        self._prepare_inputs()
+        self.sampler.pre_process(skip_idx_list)

         # NOTE(wufeisheng): If `not_need_stop`` is False, it means the current worker is in an idle state.
         # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
@@ -1180,11 +1183,6 @@ class at the server level, which is too granular for ModelRunner.
             self._execute_empty_input()
             return None

-        # 1. Prepare inputs of model and sampler.
-        skip_idx_list = self._get_skip_idx(model_forward_batch)
-        self._prepare_inputs()
-        self.sampler.pre_process(skip_idx_list)
-
         # 2. Padding inputs for cuda graph
         self.padding_cudagraph_inputs()
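
Read together, the last two hunks move code rather than delete it: the decode/prefill consensus now lives in initialize_forward_meta, and step 1 of execute_model runs before the idle-worker guard. A sketch of the resulting order of operations; the not_need_stop guard is paraphrased from the NOTE above, and elided bodies are marked:

def execute_model(self, model_forward_batch=None):
    # 1. Prepare inputs of model and sampler (now ahead of the idle check).
    skip_idx_list = self._get_skip_idx(model_forward_batch)
    self._prepare_inputs()
    self.sampler.pre_process(skip_idx_list)

    # Idle EP workers still execute an empty input so collective ops stay
    # aligned across ranks (paraphrased from the NOTE(wufeisheng) comment).
    if not self.not_need_stop():
        self._execute_empty_input()
        return None

    # 2. Padding inputs for cuda graph
    self.padding_cudagraph_inputs()
    ...  # model forward, sampling, etc. (elided)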
