Commit a04be2d

is_tensor_stream_capturing instead of cudaStreamIsCapturing
1 parent fef447e commit a04be2d

File tree: 4 files changed, +36 -5 lines

custom_ops/gpu_ops/cpp_extensions.cc (4 additions, 0 deletions)

@@ -552,6 +552,8 @@ int64_t open_mem_handle(paddle::Tensor& mem_handle);
 
 void free_shared_buffer(int64_t buffer);
 
+bool is_tensor_stream_capturing(paddle::Tensor& input, int64_t _fa);
+
 // speculative decoding Kernel
 std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
     const paddle::Tensor& input_ids,

@@ -1103,6 +1105,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
 
   m.def("free_shared_buffer", &free_shared_buffer, "free_shared_buffer");
 
+  m.def("is_tensor_stream_capturing", &is_tensor_stream_capturing, "check whether the tensor's stream is capturing");
+
   m.def("open_mem_handle", &open_mem_handle, "open_mem_handle");
 
   m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta");

custom_ops/gpu_ops/custom_all_reduce/all_reduce.cu (5 additions, 0 deletions)

@@ -163,3 +163,8 @@ fptr_t open_mem_handle(paddle::Tensor& mem_handle) {
 void free_shared_buffer(fptr_t buffer) {
   CUDACHECK(cudaFree(reinterpret_cast<void*>(buffer)));
 }
+
+bool is_tensor_stream_capturing(paddle::Tensor& input, fptr_t _fa) {
+  auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
+  return fa->is_tensor_stream_capturing(input);
+}
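
The fptr_t round trip above is the usual opaque-handle pattern: the Python side holds the CustomAllreduce object only as an integer, and each C-style entry point casts it back before dispatching. A self-contained sketch of that pattern, with hypothetical Reducer/create_reducer/reducer_is_capturing names:

#include <cstdint>

using fptr_t = int64_t;  // same opaque-handle typedef style as the commit

// Hypothetical stand-in for CustomAllreduce.
struct Reducer {
  bool capturing = false;
  bool is_capturing() const { return capturing; }
};

// Python receives the object only as an integer...
fptr_t create_reducer() { return reinterpret_cast<fptr_t>(new Reducer{}); }

// ...and each entry point casts it back before dispatching, mirroring the
// reinterpret_cast in is_tensor_stream_capturing above.
bool reducer_is_capturing(fptr_t handle) {
  return reinterpret_cast<Reducer*>(handle)->is_capturing();
}

void destroy_reducer(fptr_t handle) {
  delete reinterpret_cast<Reducer*>(handle);
}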

custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh (15 additions, 0 deletions)

@@ -441,6 +441,21 @@ class CustomAllreduce {
     graph_unreg_buffers_.clear();
   }
 
+  /**
+   * Returns true if the given Paddle GPU tensor's stream is actively capturing (cudaStreamCaptureStatusActive).
+   */
+  bool is_tensor_stream_capturing(paddle::Tensor& input)
+  {
+    auto stream = input.stream();
+    cudaStreamCaptureStatus status;
+    CUDACHECK(cudaStreamIsCapturing(stream, &status));
+    if (status == cudaStreamCaptureStatusActive) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
   /**
   * Performs allreduce, assuming input has already been registered.
   *
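
The new method reduces to a single CUDA runtime query. A standalone sketch (not from this commit) of how cudaStreamIsCapturing behaves across a capture's lifetime:

#include <cstdio>
#include <cuda_runtime.h>

static bool stream_is_capturing(cudaStream_t stream) {
  cudaStreamCaptureStatus status;
  // cudaStreamIsCapturing reports cudaStreamCaptureStatusActive only while
  // the stream is between cudaStreamBeginCapture and cudaStreamEndCapture.
  cudaStreamIsCapturing(stream, &status);
  return status == cudaStreamCaptureStatusActive;
}

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  printf("before capture: %d\n", stream_is_capturing(stream));  // 0

  cudaGraph_t graph;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  printf("during capture: %d\n", stream_is_capturing(stream));  // 1
  cudaStreamEndCapture(stream, &graph);

  printf("after capture: %d\n", stream_is_capturing(stream));   // 0
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  return 0;
}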

fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py (12 additions, 5 deletions)

@@ -31,6 +31,7 @@
     meta_size,
     register_buffer,
     register_graph_buffers,
+    is_tensor_stream_capturing,
 )
 
 try:

@@ -163,6 +164,15 @@ def all_reduce(
         all_reduce(self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size)
         return out
 
+    def iscapturing(
+        self,
+        input: paddle.Tensor,
+    ):
+        """
+        Check whether the tensor's stream is currently capturing.
+        """
+        return is_tensor_stream_capturing(input, self._ptr)
+
     def start_capture(self):
         """
         Set the CUDA graph flag to True.

@@ -207,11 +217,8 @@ def register_graph_buffers(self):
     def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]:
         """The main allreduce API that provides support for cuda graph."""
         if self.capturing:
-            lib = cuda_wrapper.CudaRTLibrary()
-            stream = paddle.device.current_stream()
-            stream_capturing = lib.cudaStreamIsCapturing(stream)
-            if stream_capturing.value == 1:
-                # 1 is cudaStreamCaptureStatusActive: The stream is capturing.
+            if self.iscapturing(input):
+                # The input tensor's stream is capturing.
                 return self.all_reduce(input, input, registered=True)
             else:
                 # If warm up, mimic the allocation pattern since custom
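
The Python change swaps a ctypes call that queried paddle.device.current_stream() for a query of the input tensor's own stream via the new op. Capture status is tracked per stream, so the two can disagree; a minimal two-stream sketch (assumed setup, not from this commit) of that distinction:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaStream_t capturing_stream, other_stream;
  cudaStreamCreate(&capturing_stream);
  cudaStreamCreate(&other_stream);

  cudaGraph_t graph;
  cudaStreamBeginCapture(capturing_stream, cudaStreamCaptureModeThreadLocal);

  // Capture status is per stream: only the stream under capture reports
  // cudaStreamCaptureStatusActive, which is why the new check targets the
  // input tensor's stream rather than a globally "current" one.
  cudaStreamCaptureStatus a, b;
  cudaStreamIsCapturing(capturing_stream, &a);
  cudaStreamIsCapturing(other_stream, &b);
  printf("capturing_stream active: %d\n", a == cudaStreamCaptureStatusActive);  // 1
  printf("other_stream active:     %d\n", b == cudaStreamCaptureStatusActive);  // 0

  cudaStreamEndCapture(capturing_stream, &graph);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(capturing_stream);
  cudaStreamDestroy(other_stream);
  return 0;
}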
