Commit 045efaf

add test for CustomAllreduce
1 parent 0b77d39 commit 045efaf

File tree

2 files changed: +121 -0 lines changed

test/distributed/custom_all_reduce.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import os
import unittest

import numpy as np
import paddle
import paddle.distributed as dist
from paddle.distributed import fleet

from fastdeploy.distributed.custom_all_reduce import CustomAllreduce


class Test(unittest.TestCase):
    def setUp(self):
        """
        Initialize the test environment: fix the random seed and
        start a two-way model-parallel fleet group on two GPUs.
        """
        os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
        paddle.seed(2025)
        strategy = fleet.DistributedStrategy()
        strategy.hybrid_configs = {
            "dp_degree": 1,
            "mp_degree": 2,
            "pp_degree": 1,
            "sharding_degree": 1,
        }

        fleet.init(is_collective=True, strategy=strategy)

    def test_case(self):
        """
        Check that CustomAllreduce matches paddle.distributed.all_reduce.
        """
        # Tensor shapes (m, n) to exercise, from small to multi-megabyte payloads.
        mns = [[1, 2048], [2, 4096], [20, 4096], [128, 4096], [256, 4096], [256, 8192]]

        hcg = fleet.get_hybrid_communicate_group()
        model_parallel_group = hcg.get_model_parallel_group()
        fa = CustomAllreduce(model_parallel_group)

        for m, n in mns:
            data_ar = paddle.rand([m, n], dtype="bfloat16")
            data_paddle = data_ar.clone()
            # Reduce data_ar with the custom kernel when the input qualifies,
            # and data_paddle with the stock all_reduce as the reference.
            if fa.should_custom_ar(data_ar):
                fa.custom_all_reduce(data_ar)
            dist.all_reduce(data_paddle)
            if dist.get_rank() == 0:
                np.testing.assert_allclose(
                    data_ar.numpy(),
                    data_paddle.numpy(),
                    rtol=1e-04,
                    atol=1e-04,
                )


if __name__ == "__main__":
    unittest.main()
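
Note that custom_all_reduce.py is a worker script: it expects to be started on two ranks by paddle.distributed.launch rather than invoked directly. The launcher test below does exactly that; the equivalent manual command is:

    python -m paddle.distributed.launch --gpus 0,1 test/distributed/custom_all_reduce.py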
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import subprocess
import sys


def test_custom_all_reduce_launch():
    """
    Launch custom_all_reduce.py on two GPUs via paddle.distributed.launch
    and assert that the worker processes exit cleanly.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    custom_all_reduce_script = os.path.join(current_dir, "custom_all_reduce.py")
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    command = [
        sys.executable,
        "-m",
        "paddle.distributed.launch",
        "--gpus",
        "0,1",
        custom_all_reduce_script,
    ]

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    try:
        # Allow up to 400 seconds for the distributed run before declaring failure.
        stdout, stderr = process.communicate(timeout=400)
        return_code = process.returncode
    except subprocess.TimeoutExpired:
        process.kill()
        stdout, stderr = process.communicate()
        return_code = -1
    assert return_code == 0, f"Process exited with code {return_code}"


# Run the check when this file is executed (or imported) directly.
test_custom_all_reduce_launch()
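
A possible refinement, not part of this commit: include the captured worker output in the assertion message, so a CI failure shows why the launch died rather than only its exit code. A minimal sketch:

    assert return_code == 0, (
        f"Process exited with code {return_code}\n"
        f"stdout:\n{stdout}\n"
        f"stderr:\n{stderr}"
    )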
