
Commit 34ebf2a

fix code style
1 parent da2aa36 commit 34ebf2a

File tree

fastdeploy/config.py
fastdeploy/model_executor/layers/moe/ep.py
fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
fastdeploy/worker/gpu_model_runner.py

4 files changed: +31 -22 lines


fastdeploy/config.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,6 @@
 
 import os
 from dataclasses import dataclass, field
-from enum import Enum
 from typing import Literal, Optional
 
 from paddleformers.transformers.configuration_utils import PretrainedConfig
@@ -34,6 +33,7 @@ class MoEPhase:
     """
     The generation phase of the moe.
    """
+
     def __init__(self, phase="prefill"):
         self._phase = phase
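
For reference, here is a minimal sketch of MoEPhase consistent with how the rest of this commit uses it: ep.py reads moe_phase.phase and gpu_model_runner.py assigns it. The constructor and docstring come from the diff above; the property/setter pair is an assumption, not copied from the source file.

class MoEPhase:
    """
    The generation phase of the moe.
    """

    def __init__(self, phase="prefill"):
        self._phase = phase

    # Assumed accessor pair: other files in this commit read and write
    # moe_phase.phase, so a plain property plus setter is the simplest fit.
    @property
    def phase(self):
        return self._phase

    @phase.setter
    def phase(self, value):
        self._phase = value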

fastdeploy/model_executor/layers/moe/ep.py

Lines changed: 4 additions & 5 deletions
@@ -74,7 +74,7 @@ def __init__(
         self.ep_config = Config(24, 6, 256)
         self.num_max_dispatch_tokens_per_rank = num_max_dispatch_tokens_per_rank
 
-        # In mixed EP mode on a single node, we dynamically switch between
+        # In mixed EP mode on a single node, we dynamically switch between
         # high throughput and low latency modes.
         if splitwise_role == "mixed":
             # decode engine
@@ -88,7 +88,7 @@ def __init__(
                 low_latency_mode=False,
                 num_qps_per_rank=1,
             )
-        # In disaggregated mode on mutiple nodes, we either use
+        # In disaggregated mode on mutiple nodes, we either use
         # high throughput mode or low latency mode.
         else:
             if moe_phase.phase == "decode":
@@ -105,7 +105,6 @@ def __init__(
             else:
                 raise ValueError(f"Unknown generation phase {moe_phase}")
 
-
     def get_low_latency_buffer(self):
         """
         Get the DeepEP buffer.
@@ -194,7 +193,7 @@ def low_latency_combine(
         Return:
             combined_hidden_states: [num_tokens, hidden]
         """
-        # TODO(@wufeisheng): Delete them when deepep in PaddlePaddle is fixed
+        # TODO(@wufeisheng): Delete them when deepep in PaddlePaddle is fixed
        (
             src_info,
             layout_range,
@@ -208,7 +207,7 @@ def low_latency_combine(
             None,
             num_experts,
         )
-
+
         combined_hidden_states, _, combine_hook = self.decode_deepep_engine.low_latency_combine(
             hidden_states,
             topk_idx,
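
The hunks above only touch comments and blank lines, but the surrounding branch structure is what decides which DeepEP engine gets built. Below is a standalone sketch of that decision with engine construction reduced to string tags; the function name select_deepep_mode and the returned tags are illustrative, not from the source.

def select_deepep_mode(splitwise_role: str, phase: str) -> str:
    """Illustrative stand-in for the engine selection in __init__."""
    if splitwise_role == "mixed":
        # Mixed EP on a single node: both engines stay available and the
        # runner switches between them per batch (see gpu_model_runner.py).
        return "dynamic"
    # Disaggregated mode on multiple nodes: commit to one mode up front.
    if phase == "decode":
        return "low_latency"
    if phase == "prefill":
        return "high_throughput"
    raise ValueError(f"Unknown generation phase {phase}")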

fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py

Lines changed: 23 additions & 13 deletions
@@ -19,8 +19,6 @@
 import paddle
 from paddle import nn
 
-from fastdeploy.config import MoEPhase
-
 from ..quantization.quant_base import QuantMethodBase
 
 
@@ -46,11 +44,16 @@ def init_ep(self, layer: nn.Layer) -> None:
         """
         if layer.ep_size > 1:
             if layer.fd_config.parallel_config.splitwise_role == "mixed":
-                from .ep import EPPrefillRunner, EPDecoderRunner
+                from .ep import EPDecoderRunner, EPPrefillRunner
+
                 self.ep_prefill_runner = EPPrefillRunner(
-                    layer.top_k, layer.hidden_size, layer.num_experts,
+                    layer.top_k,
+                    layer.hidden_size,
+                    layer.num_experts,
                     layer.fd_config.parallel_config.splitwise_role,
-                    layer.ep_size, layer.ep_rank)
+                    layer.ep_size,
+                    layer.ep_rank,
+                )
                 self.ep_decoder_runner = EPDecoderRunner(
                     layer.top_k,
                     layer.hidden_size,
@@ -63,17 +66,27 @@ def init_ep(self, layer: nn.Layer) -> None:
             else:
                 if layer.fd_config.parallel_config.moe_phase == "prefill":
                     from .ep import EPPrefillRunner
+
                     self.ep_prefill_runner = EPPrefillRunner(
-                        layer.top_k, layer.hidden_size, layer.num_experts,
+                        layer.top_k,
+                        layer.hidden_size,
+                        layer.num_experts,
                         layer.fd_config.parallel_config.splitwise_role,
-                        layer.ep_size, layer.ep_rank)
+                        layer.ep_size,
+                        layer.ep_rank,
+                    )
                 else:
                     from .ep import EPDecoderRunner
+
                     self.ep_decoder_runner = EPDecoderRunner(
-                        layer.top_k, layer.hidden_size, layer.num_experts,
+                        layer.top_k,
+                        layer.hidden_size,
+                        layer.num_experts,
                         layer.moe_config.num_max_dispatch_tokens_per_rank,
                         layer.fd_config.parallel_config.splitwise_role,
-                        layer.ep_size, layer.ep_rank)
+                        layer.ep_size,
+                        layer.ep_rank,
+                    )
 
     def process_loaded_weights(self, layer, weights) -> None:
         """
@@ -149,10 +162,7 @@ def apply(
         if layer.ep_size > 1:
             if layer.fd_config.parallel_config.moe_phase.phase == "prefill":
                 return self.apply_ep_prefill(layer, x, gate_out)
-            elif layer.fd_config.parallel_config.moe_phase.phase == "decode":
-                return self.apply_ep_decode(layer, x, gate_out)
             else:
-                logger.error(
-                    f"invalid value of moe_phase={layer.fd_config.parallel_config.moe_phase.phase}")
+                return self.apply_ep_decode(layer, x, gate_out)
         else:
             return self.apply_tp(layer, x, gate_out)
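
One behavioral note on the apply() hunk: the explicit decode branch and the error log for unrecognized phases are gone, so any phase other than "prefill" now takes the decode path. A tiny sketch of the new dispatch; the helper name route_moe_apply is hypothetical and only illustrates the fall-through.

def route_moe_apply(phase: str) -> str:
    # Mirrors the simplified if/else above: "prefill" goes to the prefill
    # runner, every other phase value falls through to decode.
    if phase == "prefill":
        return "apply_ep_prefill"
    return "apply_ep_decode"


assert route_moe_apply("prefill") == "apply_ep_prefill"
assert route_moe_apply("decode") == "apply_ep_decode"
assert route_moe_apply("unexpected") == "apply_ep_decode"  # silent fall-through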

fastdeploy/worker/gpu_model_runner.py

Lines changed: 3 additions & 3 deletions
@@ -794,16 +794,16 @@ def initialize_forward_meta(self):
         # Update Batch type for cuda graph
         # TODO(gongshaotian): Use seq_lens_encoder to set is_decode_batch
         is_decode_batch = not ((self.share_inputs["seq_lens_this_time"] > 1).sum() > 0)
-
+
         # mix ep in single node
         if self.fd_config.parallel_config.use_ep and self.fd_config.parallel_config.splitwise_role == "mixed":
             is_decode_batch_list = []
             paddle.distributed.all_gather_object(is_decode_batch_list, is_decode_batch)
             is_decode_batch = all(is_decode_batch_list)
             self.fd_config.parallel_config.moe_phase.phase = "decode" if is_decode_batch else "prefill"
-
+
         self.forward_meta.step_use_cudagraph = self.use_cudagraph and is_decode_batch
-
+
         # Initialzie attention meta data
         for attn_backend in self.attn_backends:
             attn_backend.init_attention_metadata(self.forward_meta)
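
The mixed-EP branch above makes every rank agree on the batch type before flipping moe_phase and enabling the CUDA graph path. A minimal sketch of that consensus step, assuming an initialized paddle.distributed process group; the helper name agree_on_decode_batch is not from the source.

import paddle.distributed as dist


def agree_on_decode_batch(local_is_decode: bool) -> bool:
    # Each rank contributes its local flag; the batch only counts as a pure
    # decode batch (and is CUDA-graph eligible) if all ranks agree.
    gathered = []
    dist.all_gather_object(gathered, local_is_decode)
    return all(gathered)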
