
Commit 66b3a8c

Add fp4 allgather support
1 parent 28f7f6f commit 66b3a8c

5 files changed (+65 lines, -25 lines)


python/sglang/srt/distributed/parallel_state.py

Lines changed: 5 additions & 3 deletions
@@ -768,9 +768,11 @@ def _all_gather_single(
         else:
             output_size = (input_size[0] * world_size,) + input_size[1:]
         # Allocate output tensor.
-        output_tensor = torch.empty(
-            output_size, dtype=input_.dtype, device=input_.device
-        )
+        with use_symmetric_memory(self, disabled=sizes is not None) as sm:
+            output_tensor = torch.empty(
+                output_size, dtype=input_.dtype, device=input_.device
+            )
+            sm.tag(output_tensor)
         pynccl_comm.all_gather(output_tensor, input_, sizes=sizes)
         return output_tensor
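
The pattern this commit applies throughout is visible in this hunk: the tensor that will feed the NCCL collective is allocated inside a use_symmetric_memory context and tagged, so the communicator can register it as a symmetric-memory buffer; when the optimization does not apply (here, ragged sizes), the context is disabled and the allocation behaves like a plain torch.empty. A minimal sketch of that usage, assuming the tensor-parallel group from get_tp_group() and a hypothetical shape and dtype:

# Sketch only: the allocate-then-tag pattern from the hunk above.
# `shape` and `dtype` are hypothetical placeholders; the tagged tensor is
# subsequently handed to the group's all-gather (see the diff).
import torch

from sglang.srt.distributed import get_tp_group
from sglang.srt.distributed.device_communicators.pynccl_allocator import (
    use_symmetric_memory,
)


def alloc_symmetric(shape, dtype=torch.bfloat16, disabled=False):
    tp_group = get_tp_group()
    with use_symmetric_memory(tp_group, disabled=disabled) as sm:
        out = torch.empty(shape, dtype=dtype, device="cuda")
        sm.tag(out)  # mark the allocation so the communicator treats it as symmetric memory
    return out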

python/sglang/srt/layers/moe/topk.py

Lines changed: 23 additions & 9 deletions
@@ -31,12 +31,17 @@
 import torch.nn.functional as F
 
 from sglang.srt.custom_op import CustomOp
+from sglang.srt.distributed import get_tp_group
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
 from sglang.srt.eplb import expert_location_dispatch
 from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
 from sglang.srt.eplb.expert_location_dispatch import (
     ExpertLocationDispatchInfo,
     topk_ids_logical_to_physical,
 )
+from sglang.srt.layers.dp_attention import is_max_padding
 from sglang.srt.layers.moe import (
     get_moe_runner_backend,
     should_use_flashinfer_trtllm_moe,
@@ -265,13 +270,19 @@ def forward_cuda(
             )
         else:
             self.topk_config.torch_native = False
-            return select_experts(
-                hidden_states=hidden_states,
-                router_logits=router_logits,
-                topk_config=self.topk_config,
-                num_token_non_padded=num_token_non_padded,
-                expert_location_dispatch_info=expert_location_dispatch_info,
-            )
+            with use_symmetric_memory(
+                get_tp_group(), disabled=not is_max_padding()
+            ) as sm:
+                topk_output = select_experts(
+                    hidden_states=hidden_states,
+                    router_logits=router_logits,
+                    topk_config=self.topk_config,
+                    num_token_non_padded=num_token_non_padded,
+                    expert_location_dispatch_info=expert_location_dispatch_info,
+                )
+                sm.tag(topk_output.topk_weights)
+                sm.tag(topk_output.topk_ids)
+            return topk_output
 
     def forward_cpu(
         self,
@@ -329,8 +340,11 @@ def forward_npu(
 
     def empty_topk_output(self, device: torch.device) -> TopKOutput:
         topk = self.topk_config.top_k - self.topk_config.num_fused_shared_experts
-        topk_weights = torch.empty((0, topk), dtype=torch.float32, device=device)
-        topk_idx = torch.full((0, topk), -1, dtype=torch.int32, device=device)
+        with use_symmetric_memory(get_tp_group(), disabled=not is_max_padding()) as sm:
+            topk_weights = torch.empty((0, topk), dtype=torch.float32, device=device)
+            topk_idx = torch.full((0, topk), -1, dtype=torch.int32, device=device)
+            sm.tag(topk_weights)
+            sm.tag(topk_idx)
         router_logits = torch.empty((0, topk), dtype=torch.float32, device=device)
         return StandardTopKOutput(topk_weights, topk_idx, router_logits)

python/sglang/srt/layers/quantization/modelopt_quant.py

Lines changed: 31 additions & 11 deletions
@@ -8,7 +8,14 @@
 from torch.nn.parameter import Parameter
 
 from sglang.srt.distributed import get_tp_group
-from sglang.srt.layers.dp_attention import get_dp_global_num_tokens, get_local_dp_buffer
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.layers.dp_attention import (
+    get_dp_global_num_tokens,
+    get_local_dp_buffer,
+    is_max_padding,
+)
 from sglang.srt.layers.moe import (
     should_use_flashinfer_cutlass_moe_fp4_allgather,
     should_use_flashinfer_trtllm_moe,
@@ -1268,22 +1275,35 @@ def apply(
         from flashinfer import fp4_quantize, nvfp4_block_scale_interleave
 
         # Quantize before comm, swizzle after.
-        if x.shape[0] > 0:
-            x, x_sf = fp4_quantize(
-                x, layer.w13_input_scale_quant, is_sf_swizzled_layout=False
-            )
-        else:
-            x_col = x.shape[1]
-            x = torch.zeros(0, x_col // 2, dtype=torch.uint8, device=x.device)
-            x_sf = torch.zeros(
-                0, x_col // 16, dtype=torch.uint8, device=x.device
-            )
+        with use_symmetric_memory(
+            get_tp_group(), disabled=not is_max_padding()
+        ) as sm:
+            if x.shape[0] > 0:
+                x, x_sf = fp4_quantize(
+                    x, layer.w13_input_scale_quant, is_sf_swizzled_layout=False
+                )
+            else:
+                x_col = x.shape[1]
+                x = torch.zeros(
+                    0, x_col // 2, dtype=torch.uint8, device=x.device
+                )
+                x_sf = torch.zeros(
+                    0, x_col // 16, dtype=torch.uint8, device=x.device
+                )
+            sm.tag(x)
+            sm.tag(x_sf)
         topk_weights, topk_ids, x, x_sf = get_tp_group().all_gatherv(
             [topk_weights, topk_ids, x, x_sf], sizes=get_dp_global_num_tokens()
         )
         x_sf = nvfp4_block_scale_interleave(x_sf)
 
+        with use_symmetric_memory(
+            get_tp_group(), disabled=not is_max_padding()
+        ) as sm:
+            symm_output = torch.empty_like(x)
+            sm.tag(symm_output)
         output = flashinfer_cutlass_fused_moe(
+            output=symm_output,
             input=x,
             token_selected_experts=topk_ids.to(torch.int),
             token_final_scales=topk_weights,
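
Taken together, this hunk is the fp4 allgather path the commit adds: each rank quantizes its local activations to nvfp4 before communicating (a packed uint8 payload plus per-block scale factors), the quantized tensors are all-gathered across the TP group together with the top-k outputs, and the scale factors are swizzled only after the gather. The // 2 and // 16 in the empty-batch branch follow from the nvfp4 layout: two 4-bit values per uint8 byte and one scale factor per 16-element block. A small sketch of that shape bookkeeping, with hidden as a hypothetical hidden size:

# Sketch only: shape bookkeeping mirroring the empty-batch branch above.
# nvfp4 packs two 4-bit values per uint8 and keeps one scale factor per
# 16-element block, hence `hidden // 2` and `hidden // 16`.
import torch


def empty_fp4_buffers(hidden: int, device: torch.device):
    assert hidden % 16 == 0, "nvfp4 scale blocks assume 16-element groups"
    x = torch.zeros(0, hidden // 2, dtype=torch.uint8, device=device)      # packed fp4 payload
    x_sf = torch.zeros(0, hidden // 16, dtype=torch.uint8, device=device)  # per-block scale factors
    return x, x_sf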

python/sglang/srt/model_executor/graph_runner.py

Lines changed: 5 additions & 1 deletion
@@ -638,7 +638,11 @@ def capture_one_batch_size(self, bs: int, forward: Callable):
         def run_once():
             # Clean intermediate result cache for DP attention
             forward_batch.dp_local_start_pos = forward_batch.dp_local_num_tokens = None
-            set_dp_buffer_len(global_dp_buffer_len, num_tokens)
+            set_dp_buffer_len(
+                global_dp_buffer_len,
+                num_tokens,
+                forward_batch.dp_padding_mode.is_max_len(),
+            )
 
             kwargs = {}
             if (

python/sglang/srt/operations.py

Lines changed: 1 addition & 1 deletion
@@ -96,8 +96,8 @@ def next(self):
         set_dp_buffer_len(
             self._global_dp_buffer_len,
             self._local_dp_buffer_len,
-            self._global_num_tokens,
             self._is_max_padding,
+            self._global_num_tokens,
         )
 
         with _annotate_region(debug_name=f"{self._debug_name}{self._index}"):
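
This hunk, together with the graph_runner.py change above, reorders the positional arguments of set_dp_buffer_len so that the max-padding flag comes third and the optional global token counts come last. A hypothetical signature inferred from the two updated call sites (the real definition, presumably in sglang.srt.layers.dp_attention, may differ):

# Hypothetical signature inferred from the two updated call sites in this
# commit; not copied from the repository.
from typing import List, Optional


def set_dp_buffer_len(
    global_dp_buffer_len: int,
    local_dp_buffer_len: int,
    is_max_padding: bool,
    global_num_tokens: Optional[List[int]] = None,
) -> None:
    ...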
