Skip to content

Commit 8a82866

Browse files
liz-badada and jychen21 authored
Add DeepEP to CI PR Test (#5655)
Co-authored-by: Jinyan Chen <jinyanc@nvidia.com>
1 parent aff584f commit 8a82866

10 files changed

+1607
-3
lines changed

.github/workflows/pr-test.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ jobs:
9797

9898
- name: Install dependencies
9999
run: |
100-
bash scripts/ci_install_dependency.sh
100+
bash scripts/ci_install_dependency_8_gpu.sh
101101
102102
- name: Run test
103103
timeout-minutes: 40
@@ -259,9 +259,9 @@ jobs:
259259
finish:
260260
if: always()
261261
needs: [
262-
unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu,
262+
unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-8-gpu,
263263
performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
264-
accuracy-test-1-gpu, accuracy-test-2-gpu
264+
accuracy-test-1-gpu, accuracy-test-2-gpu,
265265
]
266266
runs-on: ubuntu-latest
267267
steps:
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Builds the DeepEP-enabled SGLang image from docker/Dockerfile.deepep and
# pushes it to Docker Hub as lmsysorg/sglang:deepep.
name: Build DeepEP Docker Image

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  schedule:
    # Nightly at 00:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 0 * * *'

jobs:
  build-dev:
    # Run only in the upstream repository; the job is skipped on forks.
    if: ${{ github.repository == 'sgl-project/sglang' }}
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Free disk space
        # Pinned to a release tag instead of the mutable `main` branch: this
        # job later handles Docker Hub credentials, so third-party action code
        # should not be able to change underneath it.
        uses: jlumbroso/free-disk-space@v1.3.1
        with:
          tool-cache: false
          docker-images: false
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          swap-storage: false

      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and Push DeepEP Image
        run: |
          docker build . -f docker/Dockerfile.deepep -t lmsysorg/sglang:deepep --no-cache
          docker push lmsysorg/sglang:deepep
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
# Copied from deepseek-ai/DeepEP/tests/test_utils.py
2+
3+
import os
4+
import sys
5+
from typing import Optional
6+
7+
import numpy as np
8+
import torch
9+
import torch.distributed as dist
10+
11+
12+
def init_dist(local_rank: int, num_local_ranks: int):
    """Join the NCCL process group on behalf of one local rank.

    Connection settings come from the environment: ``MASTER_ADDR`` (default
    ``127.0.0.1``), ``MASTER_PORT`` (default ``8361``), ``WORLD_SIZE`` (read
    here as the number of nodes, default 1) and ``RANK`` (read here as the
    node index, default 0).

    As a side effect the torch default dtype becomes ``bfloat16``, the default
    device becomes ``"cuda"`` and the current CUDA device is set to
    ``local_rank``.

    Args:
        local_rank: Index of this process within its node.
        num_local_ranks: Number of processes launched per node.

    Returns:
        ``(rank, world_size, group)`` where ``group`` is a newly created
        process group containing every rank.
    """
    # NOTES: you may rewrite this function with your own cluster settings
    master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
    master_port = int(os.getenv("MASTER_PORT", "8361"))
    node_count = int(os.getenv("WORLD_SIZE", 1))
    node_index = int(os.getenv("RANK", 0))

    # Fewer than 8 local ranks is only accepted for a single-node run.
    assert num_local_ranks == 8 or (num_local_ranks < 8 and node_count == 1)

    total_ranks = node_count * num_local_ranks
    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://{master_addr}:{master_port}",
        world_size=total_ranks,
        rank=node_index * num_local_ranks + local_rank,
    )
    torch.set_default_dtype(torch.bfloat16)
    torch.set_default_device("cuda")
    torch.cuda.set_device(local_rank)

    everyone = dist.new_group(list(range(total_ranks)))
    return dist.get_rank(), dist.get_world_size(), everyone
35+
36+
37+
def calc_diff(x: torch.Tensor, y: torch.Tensor):
    """Return a scalar dissimilarity between two tensors.

    Both inputs are promoted to float64 and shifted by +1; the result is
    ``1 - 2 * sum(x * y) / sum(x * x + y * y)`` on the shifted values, which is
    0.0 when the tensors are identical.

    Returns:
        The dissimilarity as a Python float.
    """
    shifted_x = x.double() + 1
    shifted_y = y.double() + 1
    denominator = (shifted_x * shifted_x + shifted_y * shifted_y).sum()
    similarity = 2 * (shifted_x * shifted_y).sum() / denominator
    return (1 - similarity).item()
42+
43+
44+
def per_token_cast_to_fp8(x: torch.Tensor):
45+
assert x.dim() == 2 and x.size(1) % 128 == 0
46+
m, n = x.shape
47+
x_view = x.view(m, -1, 128)
48+
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
49+
return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(
50+
m, n
51+
), (x_amax / 448.0).view(m, -1)
52+
53+
54+
def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
55+
x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
56+
x_scales = x_scales.view(x_fp8.size(0), -1, 1)
57+
return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)
58+
59+
60+
def inplace_unique(x: torch.Tensor, num_slots: int):
    """Replace each row of ``x`` with its distinct non-negative values, in place.

    After the call each row holds its distinct values in descending order,
    left-aligned, with the remaining positions set to -1. Negative entries in
    the input are treated as padding and ignored.

    Args:
        x: 2-D integer tensor, modified in place. Non-negative entries are
            used as bin indices, so they must be smaller than ``num_slots``.
        num_slots: Number of distinct values that may occur.
    """
    assert x.dim() == 2
    num_rows, num_cols = x.shape

    # Route padding entries into one extra bin that is dropped afterwards.
    padded = x.masked_fill(x < 0, num_slots)
    counts = torch.zeros((num_rows, num_slots + 1), dtype=x.dtype, device=x.device)
    counts.scatter_add_(1, padded, torch.ones_like(padded))
    counts = counts[:, :num_slots]

    # Slots that never occurred become -1; the second sort pushes them right.
    ordered_counts, ordered_slots = torch.sort(counts, dim=-1, descending=True)
    ordered_slots.masked_fill_(ordered_counts == 0, -1)
    ordered_slots = torch.sort(ordered_slots, descending=True, dim=-1).values

    x.fill_(-1)
    keep = min(num_slots, num_cols)
    x[:, :keep] = ordered_slots[:, :keep]
73+
74+
75+
def create_grouped_scores(
    scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int
):
    """Zero out the scores of every expert group not listed in ``group_idx``.

    Args:
        scores: ``(num_tokens, num_experts)`` tensor; experts are split into
            ``num_groups`` contiguous groups of equal size.
        group_idx: Integer tensor of group indices to keep for each token,
            scattered along dim 1.
        num_groups: Number of expert groups.

    Returns:
        A tensor shaped like ``scores`` with the unselected groups zeroed.
    """
    num_tokens, num_experts = scores.shape
    grouped = scores.view(num_tokens, num_groups, -1)
    selected = torch.zeros(
        (num_tokens, num_groups), dtype=torch.bool, device=scores.device
    )
    selected.scatter_(1, group_idx, True)
    # Broadcast the per-group flag across every expert inside that group.
    keep = selected.unsqueeze(-1).expand_as(grouped)
    return (grouped * keep).view(num_tokens, num_experts)
83+
84+
85+
def bench(fn, num_warmups: int = 20, num_tests: int = 30, post_fn=None):
    """Time ``fn`` on the GPU with CUDA events.

    Args:
        fn: Zero-argument callable to measure.
        num_warmups: Untimed calls made before measuring.
        num_tests: Timed calls; the first measurement is discarded.
        post_fn: Optional zero-argument callable run after each timed call,
            outside the timed region.

    Returns:
        ``(average, minimum, maximum)`` elapsed time in seconds over the
        retained measurements.
    """
    # Flush L2 cache with 256 MB data
    torch.cuda.synchronize()
    l2_flush_buffer = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")

    # Warmup
    for _ in range(num_warmups):
        fn()

    # Flush L2
    l2_flush_buffer.zero_()

    # Testing
    event_pairs = [
        (torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True))
        for _ in range(num_tests)
    ]
    for begin, finish in event_pairs:
        # Record
        begin.record()
        fn()
        finish.record()
        if post_fn is not None:
            post_fn()
    torch.cuda.synchronize()

    # elapsed_time() reports milliseconds; convert to seconds and drop the
    # first sample.
    seconds = np.array(
        [begin.elapsed_time(finish) / 1e3 for begin, finish in event_pairs]
    )[1:]
    return np.average(seconds), np.min(seconds), np.max(seconds)
113+
114+
115+
class empty_suppress:
    """No-op context manager, used where output suppression is not wanted."""

    def __enter__(self):
        """Enter the context and hand back this object."""
        return self

    def __exit__(self, *exc_info):
        """Leave the context without suppressing any exception."""
        return None
121+
122+
123+
class suppress_stdout_stderr:
    """Context manager that silences stdout and stderr.

    Both the Python-level ``sys.stdout`` / ``sys.stderr`` objects and the file
    descriptors behind them are pointed at ``os.devnull`` for the duration of
    the ``with`` block, then restored on exit.
    """

    def __enter__(self):
        """Redirect both streams to the null device and return ``self``."""
        self.outnull_file = open(os.devnull, "w")
        self.errnull_file = open(os.devnull, "w")

        # Descriptors currently backing the Python stream objects.
        stdout_fd = sys.stdout.fileno()
        stderr_fd = sys.stderr.fileno()
        self.old_stdout_fileno_undup = stdout_fd
        self.old_stderr_fileno_undup = stderr_fd

        # Duplicate them so the original targets can be restored on exit.
        self.old_stdout_fileno = os.dup(stdout_fd)
        self.old_stderr_fileno = os.dup(stderr_fd)

        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        # Descriptor-level redirect.
        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)

        # Python-level redirect.
        sys.stdout = self.outnull_file
        sys.stderr = self.errnull_file
        return self

    def __exit__(self, *exc_info):
        """Restore both streams and release every descriptor opened on entry."""
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr

        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

        os.close(self.old_stdout_fileno)
        os.close(self.old_stderr_fileno)

        self.outnull_file.close()
        self.errnull_file.close()
156+
157+
158+
def _kineto_time_to_seconds(time_str: str) -> float:
    """Convert a profiler-table time such as ``"12.3us"`` into seconds."""
    # "ms" and "us" are tried before the bare "s" suffix they both end with.
    for suffix, per_second in (("ms", 1e3), ("us", 1e6), ("s", 1.0)):
        if time_str.endswith(suffix):
            return float(time_str[: -len(suffix)]) / per_second
    raise ValueError(f"Unrecognized time unit in profiler table entry {time_str!r}")


def bench_kineto(
    fn,
    kernel_names,
    num_tests: int = 30,
    suppress_kineto_output: bool = False,
    trace_path: Optional[str] = None,
    barrier_comm_profiling: bool = False,
):
    """Measure kernel times of ``fn`` with the torch profiler.

    ``fn`` is called ``num_tests`` times in each of two profiler steps (one
    warmup step, one active step). The profiler's averages table is then
    searched for each requested kernel name.

    Args:
        fn: Zero-argument callable that launches the kernels of interest.
        kernel_names: A kernel-name substring, or a tuple of them. Each must
            match exactly one row of the profiler table.
        num_tests: Calls to ``fn`` per profiler step.
        suppress_kineto_output: When true, stdout/stderr are silenced while
            profiling.
        trace_path: If given, a Chrome trace is exported to this path.
        barrier_comm_profiling: When true, a large matmul and an all-reduce
            are issued before each batch of calls.

    Returns:
        The time in seconds read from the second-to-last column of the matching
        table row: a single float when ``kernel_names`` is a string, otherwise
        a tuple in the same order as ``kernel_names``.

    Raises:
        AssertionError: If ``kernel_names`` has the wrong type or a name does
            not match exactly one table row.
        ValueError: If a matched row carries a time with an unknown unit.
    """
    # Profile
    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
    with suppress():
        schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
        with torch.profiler.profile(
            activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule
        ) as prof:
            for _ in range(2):
                # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
                if barrier_comm_profiling:
                    lhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
                    rhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
                    lhs @ rhs
                    dist.all_reduce(torch.ones(1, dtype=torch.float, device="cuda"))
                for _ in range(num_tests):
                    fn()
                prof.step()

    # Parse the profiling table
    assert isinstance(kernel_names, (str, tuple))
    is_tupled = isinstance(kernel_names, tuple)
    prof_lines = (
        prof.key_averages()
        .table(sort_by="cuda_time_total", max_name_column_width=100)
        .split("\n")
    )
    kernel_names = (kernel_names,) if isinstance(kernel_names, str) else kernel_names
    assert all(isinstance(name, str) for name in kernel_names)
    for name in kernel_names:
        assert (
            sum(name in line for line in prof_lines) == 1
        ), f"Errors of the kernel {name} in the profiling table"

    # Save chrome traces
    if trace_path is not None:
        prof.export_chrome_trace(trace_path)

    # Return average kernel times
    kernel_times = []
    for name in kernel_names:
        for line in prof_lines:
            if name in line:
                # Previously only "ms"/"us" were recognised and any other unit
                # was skipped silently, leaving the result misaligned with
                # kernel_names. Seconds are now handled and unknown units raise.
                kernel_times.append(_kineto_time_to_seconds(line.split()[-2]))
                break
    return tuple(kernel_times) if is_tupled else kernel_times[0]
216+
217+
218+
def hash_tensor(t: torch.Tensor):
    """Return an integer checksum of a tensor's raw bits.

    The tensor's storage is reinterpreted as ``int64`` words and summed, so
    the element size must be compatible with a view as ``int64``.

    Returns:
        The sum of the ``int64`` words as a Python int.
    """
    as_words = t.view(torch.int64)
    checksum = as_words.sum()
    return checksum.item()

python/sglang/test/test_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
)
6767
DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
6868
DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
69+
DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-V3-0324"
6970
DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
7071
"hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
7172
)

0 commit comments

Comments
 (0)