Skip to content

Commit f75d8eb

Browse files
committed
[GCU] Enable CI
1 parent 60874b4 commit f75d8eb

File tree

3 files changed

+81
-38
lines changed

3 files changed

+81
-38
lines changed

.github/workflows/ci_gcu.yml

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ jobs:
2929
REPO_NAME="${FULL_REPO##*/}"
3030
BASE_BRANCH="${{ github.base_ref }}"
3131
# Clean the repository directory before starting
32-
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
32+
docker run --rm --net=host -v $(pwd):/workspace \
33+
-v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
34+
-w /workspace \
3335
-e "REPO_NAME=${REPO_NAME}" \
3436
-e "BASE_BRANCH=${BASE_BRANCH}" \
3537
${docker_image} /bin/bash -c '
@@ -40,6 +42,7 @@ jobs:
4042
'
4143
git config --global user.name "FastDeployCI"
4244
git config --global user.email "fastdeploy_ci@example.com"
45+
source ${{ github.workspace }}/../../../proxy
4346
git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
4447
cd FastDeploy
4548
if [ "${{ github.event_name }}" = "pull_request" ]; then
@@ -50,6 +53,9 @@ jobs:
5053
git checkout ${{ github.sha }}
5154
git log -n 3 --oneline
5255
fi
56+
echo "Copy models..."
57+
sudo mkdir -p ci_models && sudo cp -r /work/deps/ERNIE-4.5-21B-A3B-Paddle ci_models
58+
echo "Copy models done."
5359
5460
- name: Run CI unittest
5561
env:
@@ -71,19 +77,21 @@ jobs:
7177
echo "PARENT_DIR:$PARENT_DIR"
7278
echo "Install drivers..."
7379
cd /work/deps
74-
bash TopsRider_i3x_*_deb_amd64.run --driver --no-auto-load -y
80+
sudo bash TopsRider_i3x_*_deb_amd64.run --driver --no-auto-load -y
7581
cd -
76-
docker run --rm --network=host --ipc=host -it --privileged \
77-
-v $(pwd):/workspace -w /workspace \
78-
-v "/home:/home" \
79-
-v "/work:/work" \
80-
-e "MODEL_PATH=/work/models" \
82+
echo "Create docker..."
83+
docker run --rm --network=host --ipc=host --privileged \
84+
-v $(pwd):/workspace \
85+
-v /home:/home \
86+
-v /work:/work \
87+
-w /workspace \
88+
-e "MODEL_PATH=./ci_models" \
8189
-e "http_proxy=$(git config --global --get http.proxy)" \
8290
-e "https_proxy=$(git config --global --get https.proxy)" \
8391
-e "FD_API_PORT=${FD_API_PORT}" \
8492
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
8593
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
86-
${docker_image} /bin/bash -c "
94+
${docker_image} /bin/bash -c "
8795
git config --global --add safe.directory /workspace/FastDeploy
8896
cd FastDeploy
8997
bash scripts/run_ci_gcu.sh

scripts/run_ci_gcu.sh

Lines changed: 55 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,42 @@
1-
#!/bin/bash
1+
#!/usr/bin/env bash
22
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
3-
echo "$DIR"
3+
echo "Current directory: ${DIR}"
44

5-
#先kill一遍
6-
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
7-
ps -efww | grep -E '8188' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
8-
lsof -t -i :8188 | xargs kill -9 || true
5+
#######################################
# Force-kill every process whose `ps -ef` entry contains a given string.
# Used both to clear leftovers before the server starts and to tear the
# server down afterwards.
# Arguments:
#   $1 - optional fixed string to look for in the `ps -ef` output
#        (default: "python", i.e. every Python process that is visible).
# Outputs:
#   Writes the list of selected PIDs to stdout.
# Returns:
#   0 always; a PID that has already exited is not treated as an error.
#######################################
function stop_processes() {
  local pattern="${1:-python}"
  local -a fastdeploy_python_pids=()
  local in_pid

  # Collect the PIDs into a real array (one per line) instead of relying on
  # word splitting of a scalar. `grep -v grep` drops the grep helper itself.
  mapfile -t fastdeploy_python_pids < <(
    ps -ef | grep -F -- "${pattern}" | grep -v grep | awk '{print $2}'
  )

  echo "Process to stop:"
  echo "${fastdeploy_python_pids[*]}"

  for in_pid in "${fastdeploy_python_pids[@]}"; do
    # Ignore anything that is not a plain numeric PID (e.g. an empty line).
    [[ "${in_pid}" =~ ^[0-9]+$ ]] || continue
    # The process may have exited between `ps` and `kill`; that is fine.
    kill -9 "${in_pid}" 2>/dev/null || true
  done
  return 0
}
913

10-
export model_path=${MODEL_PATH}/paddle/ERNIE-4.5-21B-A3B-Paddle
14+
echo "Clean up processes..."
15+
stop_processes
16+
echo "Clean up completed."
17+
18+
export model_path=${MODEL_PATH}/ERNIE-4.5-21B-A3B-Paddle
1119

1220
echo "pip install requirements"
1321
python -m pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
1422
echo "uninstall org"
1523
python -m pip uninstall paddlepaddle -y
1624
python -m pip uninstall paddle-custom-gcu -y
1725
python -m pip install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
26+
python -m pip install --pre paddle-custom-gcu==3.0.0.dev20250801 -i https://www.paddlepaddle.org.cn/packages/nightly/gcu/
1827
echo "build whl"
1928
bash build.sh 1 || exit 1
2029

2130
unset http_proxy
2231
unset https_proxy
2332
unset no_proxy
2433

25-
# 起服务
2634
rm -rf log/*
2735
rm -f core*
28-
# pkill -9 python #流水线不执行这个
29-
#清空消息队列
36+
37+
# Empty the message queue
3038
ipcrm --all=msg
39+
echo "Start server..."
3140
python -m fastdeploy.entrypoints.openai.api_server \
3241
--model ${model_path} \
3342
--port 8188 \
@@ -38,21 +47,40 @@ python -m fastdeploy.entrypoints.openai.api_server \
3847
--max-num-seqs 8 \
3948
--quantization wint4 > server.log 2>&1 &
4049

41-
sleep 60
42-
# 探活
43-
TIMEOUT=$((5 * 60))
44-
INTERVAL=10 # 检查间隔(秒)
50+
echo "Waiting 90 seconds..."
51+
sleep 90
52+
53+
if grep -q "Failed to launch worker processes" server.log; then
54+
echo "Failed to launch worker processes..."
55+
stop_processes
56+
cat server.log
57+
cat log/workerlog.0
58+
exit 1
59+
fi
60+
61+
if grep -q "Traceback (most recent call last):" server.log; then
62+
echo "Some errors occurred..."
63+
stop_processes
64+
cat server.log
65+
cat log/workerlog.0
66+
exit 1
67+
fi
68+
69+
# Health check
70+
TIMEOUT=$((11 * 60))
71+
INTERVAL=30 # Check interval (seconds)
4572
ENDPOINT="http://0.0.0.0:8188/health"
46-
START_TIME=$(date +%s) # 记录开始时间戳
47-
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
73+
START_TIME=$(date +%s) # Record the start timestamp
74+
echo "Start the server health check, maximum waiting time: ${TIMEOUT} seconds..."
4875
while true; do
49-
# 计算已耗时
76+
# Compute the time elapsed since the health check started
5077
CURRENT_TIME=$(date +%s)
5178
ELAPSED=$((CURRENT_TIME - START_TIME))
5279

53-
# 超时判断
80+
# Give up once the elapsed time reaches the timeout
5481
if [ $ELAPSED -ge $TIMEOUT ]; then
55-
echo -e "\n服务启动超时:经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
82+
echo -e "\nServer start timeout: After $((TIMEOUT/60)) minutes, the service still doesn't start!"
83+
stop_processes
5684
cat server.log
5785
cat log/workerlog.0
5886
exit 1
@@ -61,26 +89,27 @@ while true; do
6189
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
6290

6391
if [ "$HTTP_CODE" = "200" ]; then
64-
echo -e "\n服务启动成功!耗时 ${ELAPSED}"
92+
echo -e "\nThe server was successfully launched! Totally takes $((ELAPSED+90)) seconds."
6593
break
6694
else
6795
sleep $INTERVAL
6896
fi
6997
done
7098

7199
cat server.log
100+
echo -e "\n"
72101

73-
# 执行服务化推理
102+
echo "Start inference..."
74103
python test/ci_use/GCU/run_ernie.py
75104
exit_code=$?
76-
echo exit_code is ${exit_code}
105+
echo -e "exit_code is ${exit_code}.\n"
77106

78-
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
79-
ps -efww | grep -E '8188' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
80-
lsof -t -i :8188 | xargs kill -9 || true
107+
echo "Stop server..."
108+
stop_processes
109+
echo "Stop server done."
81110

82111
if [ ${exit_code} -ne 0 ]; then
83-
echo "log/workerlog.0"
112+
echo "Exit with error, please refer to log/workerlog.0"
84113
cat log/workerlog.0
85114
exit 1
86115
fi

test/ci_use/GCU/run_ernie.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,24 @@
1515
import openai
1616

1717
ip = "0.0.0.0"
18-
service_http_port = "8188" # 服务配置的
18+
service_http_port = "8188"
1919
client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
2020

21-
# 非流式对话
2221
response = client.chat.completions.create(
2322
model="default",
2423
messages=[
2524
{"role": "user", "content": "The largest ocean is"},
2625
],
2726
temperature=1,
2827
top_p=0,
29-
max_tokens=64,
28+
max_tokens=256,
3029
stream=False,
3130
)
32-
print(response)
31+
print(f"response is: {response}", flush=True)
32+
33+
generate_context = response.choices[0].message.content
34+
print(f"\ngenerate_context is: {generate_context}", flush=True)
35+
36+
assert "pacific ocean" in generate_context.lower(), "The answer was incorrect!"
37+
38+
print("Test successfully!", flush=True)

0 commit comments

Comments
 (0)