Add Qwen3-30B-A3B-Thinking-2507 support on AMD GPUs. #9456

Open · wants to merge 9 commits into main
25 changes: 18 additions & 7 deletions python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -49,13 +49,15 @@
 elif _is_cpu and _is_cpu_amx_available:
     pass
 elif _is_hip:
-    from vllm import _custom_ops as vllm_ops # gelu_and_mul, silu_and_mul
+    from sgl_kernel import gelu_and_mul, silu_and_mul
 
     if _use_aiter:
         try:
             from aiter import moe_sum
         except ImportError:
             raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
+    else:
+        from vllm import _custom_ops as vllm_ops
 
 
 if _is_cuda or _is_hip:
@@ -1537,7 +1539,7 @@ def fused_experts_impl(
             gemm1_alpha,
             gemm1_limit,
         )
-    elif _is_cuda:
+    elif _is_cuda or _is_hip:
Collaborator

There is another gelu_and_mul that can be imported from sgl_kernel when _is_hip == True.
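
For reference, here is a minimal sketch of the dispatch this diff converges on for both activations. It assumes sgl_kernel exposes silu_and_mul and gelu_and_mul with an (input, output) calling convention on both CUDA and ROCm builds (matching the calls in this hunk and the gelu hunk below), that vllm's _custom_ops variants take (output, input), and the helper name apply_gated_activation is hypothetical, not part of this PR:

```python
import torch


def apply_gated_activation(
    activation: str,
    gate_up: torch.Tensor,  # (num_tokens, 2 * d): gate and up projections, concatenated
    out: torch.Tensor,      # (num_tokens, d): written in place
    is_cuda: bool,
    is_hip: bool,
) -> None:
    """Hypothetical helper showing one shared silu/gelu-and-mul path for CUDA and HIP."""
    assert activation in ("silu", "gelu"), f"unsupported activation: {activation}"
    if is_cuda or is_hip:
        # Assumed: sgl_kernel ships both kernels for CUDA and ROCm builds and
        # takes (input, output), as in the calls shown in the diff.
        from sgl_kernel import gelu_and_mul, silu_and_mul

        kernel = silu_and_mul if activation == "silu" else gelu_and_mul
        kernel(gate_up, out)
    else:
        # Fallback kept by the diff for other platforms; vllm's ops take (output, input).
        from vllm import _custom_ops as vllm_ops

        kernel = vllm_ops.silu_and_mul if activation == "silu" else vllm_ops.gelu_and_mul
        kernel(out, gate_up)
```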

         silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
     else:
         vllm_ops.silu_and_mul(
@@ -1546,7 +1548,7 @@
 elif activation == "gelu":
     assert gemm1_alpha is None, "gemm1_alpha is not supported for gelu"
     assert gemm1_limit is None, "gemm1_limit is not supported for gelu"
-    if _is_cuda:
+    if _is_cuda or _is_hip:
         gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
     else:
         vllm_ops.gelu_and_mul(
@@ -1619,10 +1621,19 @@ def fused_experts_impl(
             out_hidden_states[begin_chunk_idx:end_chunk_idx],
         )
     else:
-        vllm_ops.moe_sum(
-            intermediate_cache3.view(*intermediate_cache3.shape),
-            out_hidden_states[begin_chunk_idx:end_chunk_idx],
-        )
+        # According to micro benchmark results, torch.compile can get better performance for small token counts.
+        if tokens_in_chunk <= 32:
+            moe_sum_reduce_torch_compile(
+                intermediate_cache3.view(*intermediate_cache3.shape),
+                out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                routed_scaling_factor,
+            )
+        else:
+            moe_sum_reduce_triton(
+                intermediate_cache3.view(*intermediate_cache3.shape),
+                out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                routed_scaling_factor,
+            )
Comment on lines +1624 to +1636
Contributor

medium

This block of code for HIP is nearly identical to the logic for CUDA at lines 1602-1614. The duplication could be avoided by refactoring and combining the logic for both _is_cuda and _is_hip, for example by using elif _is_cuda or _is_hip:, which would improve maintainability. Additionally, the CUDA path includes optimizations for topk values of 1 and 2, which could also benefit the HIP path.
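
As a rough illustration of the suggested refactor, the sketch below shows one reduction helper that both the _is_cuda and _is_hip branches could call. It is a sketch under stated assumptions, not the CUDA code at lines 1602-1614: the topk == 1 / topk == 2 fast paths follow the reviewer's description rather than the existing implementation, the helper name moe_sum_shared is hypothetical, and moe_sum_reduce_torch_compile / moe_sum_reduce_triton are the helpers already called in the hunk above, assumed importable from fused_moe.py:

```python
import torch

# Assumed import location: the same module whose diff is shown above.
from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
    moe_sum_reduce_torch_compile,
    moe_sum_reduce_triton,
)


def moe_sum_shared(
    intermediate_cache3: torch.Tensor,  # (tokens_in_chunk, topk, hidden_dim)
    out_chunk: torch.Tensor,            # (tokens_in_chunk, hidden_dim), written in place
    routed_scaling_factor: float,
    tokens_in_chunk: int,
) -> None:
    """Sketch of a single top-k reduction shared by the CUDA and HIP branches."""
    topk = intermediate_cache3.shape[1]
    if topk == 1:
        # One expert per token: the reduction degenerates to a scaled copy.
        torch.mul(intermediate_cache3[:, 0], routed_scaling_factor, out=out_chunk)
    elif topk == 2:
        # Two experts per token: add the pair directly, then scale in place.
        torch.add(intermediate_cache3[:, 0], intermediate_cache3[:, 1], out=out_chunk)
        if routed_scaling_factor != 1.0:
            out_chunk.mul_(routed_scaling_factor)
    elif tokens_in_chunk <= 32:
        # Per the micro-benchmark note in the diff, torch.compile wins for small batches.
        moe_sum_reduce_torch_compile(intermediate_cache3, out_chunk, routed_scaling_factor)
    else:
        moe_sum_reduce_triton(intermediate_cache3, out_chunk, routed_scaling_factor)
```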

 else:
     vllm_ops.moe_sum(
         intermediate_cache3.view(*intermediate_cache3.shape),