Add Qwen3-30B-A3B-Thinking-2507 support on AMD GPUs. #9456
base: main
Changes from all commits: 458338a, 965ac51, 8e74fa6, 89f06c0, cb35bb2, eb6f76e, 1ff271d, 7c1726d, 724f566
```diff
@@ -49,13 +49,15 @@
 elif _is_cpu and _is_cpu_amx_available:
     pass
 elif _is_hip:
-    from vllm import _custom_ops as vllm_ops  # gelu_and_mul, silu_and_mul
+    from sgl_kernel import gelu_and_mul, silu_and_mul

     if _use_aiter:
         try:
             from aiter import moe_sum
         except ImportError:
             raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
+    else:
+        from vllm import _custom_ops as vllm_ops


 if _is_cuda or _is_hip:
```
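For context, `_use_aiter` gates the ROCm aiter kernels. A minimal sketch of how such a flag is typically derived (inferred from the ImportError message above; the actual definition is not part of this diff):

```python
import os

# Assumption: the flag mirrors the SGLANG_USE_AITER environment variable
# referenced by the ImportError above, and only applies on ROCm builds.
_use_aiter = _is_hip and os.environ.get("SGLANG_USE_AITER", "false").lower() in (
    "1",
    "true",
)
```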
```diff
@@ -1537,7 +1539,7 @@ def fused_experts_impl(
                 gemm1_alpha,
                 gemm1_limit,
             )
-        elif _is_cuda:
+        elif _is_cuda or _is_hip:
             silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
         else:
             vllm_ops.silu_and_mul(
```
```diff
@@ -1546,7 +1548,7 @@ def fused_experts_impl(
         elif activation == "gelu":
             assert gemm1_alpha is None, "gemm1_alpha is not supported for gelu"
             assert gemm1_limit is None, "gemm1_limit is not supported for gelu"
-            if _is_cuda:
+            if _is_cuda or _is_hip:
                 gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
             else:
                 vllm_ops.gelu_and_mul(
```
```diff
@@ -1619,10 +1621,19 @@ def fused_experts_impl(
                     out_hidden_states[begin_chunk_idx:end_chunk_idx],
                 )
             else:
-                vllm_ops.moe_sum(
-                    intermediate_cache3.view(*intermediate_cache3.shape),
-                    out_hidden_states[begin_chunk_idx:end_chunk_idx],
-                )
+                # According to micro-benchmark results, torch.compile gets better performance for small token counts.
+                if tokens_in_chunk <= 32:
+                    moe_sum_reduce_torch_compile(
+                        intermediate_cache3.view(*intermediate_cache3.shape),
+                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                        routed_scaling_factor,
+                    )
+                else:
+                    moe_sum_reduce_triton(
+                        intermediate_cache3.view(*intermediate_cache3.shape),
+                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                        routed_scaling_factor,
+                    )
         else:
             vllm_ops.moe_sum(
                 intermediate_cache3.view(*intermediate_cache3.shape),
```
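Both helpers perform the same reduction: sum the top-k per-expert partial outputs for each token, then apply the routed scaling factor. A plain-PyTorch sketch of that semantics (tensor shapes are inferred from the call sites; the real helpers fuse this into a compiled or Triton kernel):

```python
import torch

def moe_sum_reduce_ref(
    x: torch.Tensor,               # [num_tokens, topk, hidden_size]
    out: torch.Tensor,             # [num_tokens, hidden_size]
    routed_scaling_factor: float,
) -> None:
    # Sum partial expert outputs over the top-k dimension, then scale.
    torch.sum(x, dim=1, out=out)
    out.mul_(routed_scaling_factor)

# The small-token path would wrap a function like this with torch.compile, e.g.:
# moe_sum_reduce_torch_compile = torch.compile(moe_sum_reduce_ref, dynamic=True)
```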
Comment on lines +1624 to +1636: This block of code for HIP is nearly identical to the logic for CUDA in lines 1602-1614. This code duplication can be avoided by refactoring and combining the logic for both platforms.
Comment: There is another `gelu_and_mul` that can be imported from `sgl_kernel` when `_is_hip == True`.
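A sketch of what that suggestion could look like (the exact import site is not shown in this diff, so the surrounding condition is an assumption):

```python
# Hypothetical: at the other import site the reviewer points to, let ROCm
# pick up the sgl_kernel implementation as well.
if _is_cuda or _is_hip:
    from sgl_kernel import gelu_and_mul
```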