Skip to content

Commit 5d9aa0c

Browse files
committed
[GCU] Enable CI
1 parent eba8f41 commit 5d9aa0c

File tree

3 files changed

+78
-38
lines changed

3 files changed

+78
-38
lines changed

.github/workflows/ci_gcu.yml

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ jobs:
2929
REPO_NAME="${FULL_REPO##*/}"
3030
BASE_BRANCH="${{ github.base_ref }}"
3131
# Clean the repository directory before starting
32-
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
32+
docker run --rm --net=host -v $(pwd):/workspace \
33+
-v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
34+
-w /workspace \
3335
-e "REPO_NAME=${REPO_NAME}" \
3436
-e "BASE_BRANCH=${BASE_BRANCH}" \
3537
${docker_image} /bin/bash -c '
@@ -40,6 +42,7 @@ jobs:
4042
'
4143
git config --global user.name "FastDeployCI"
4244
git config --global user.email "fastdeploy_ci@example.com"
45+
source ${{ github.workspace }}/../../../proxy
4346
git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
4447
cd FastDeploy
4548
if [ "${{ github.event_name }}" = "pull_request" ]; then
@@ -50,6 +53,9 @@ jobs:
5053
git checkout ${{ github.sha }}
5154
git log -n 3 --oneline
5255
fi
56+
echo "Copy models..."
57+
sudo mkdir -p ci_models && sudo cp -r /work/deps/ERNIE-4.5-21B-A3B-Paddle ci_models
58+
echo "Copy models done."
5359
5460
- name: Run CI unittest
5561
env:
@@ -71,19 +77,21 @@ jobs:
7177
echo "PARENT_DIR:$PARENT_DIR"
7278
echo "Install drivers..."
7379
cd /work/deps
74-
bash TopsRider_i3x_*_deb_amd64.run --driver --no-auto-load -y
80+
sudo bash TopsRider_i3x_*_deb_amd64.run --driver --no-auto-load -y
7581
cd -
76-
docker run --rm --network=host --ipc=host -it --privileged \
77-
-v $(pwd):/workspace -w /workspace \
78-
-v "/home:/home" \
79-
-v "/work:/work" \
80-
-e "MODEL_PATH=/work/models" \
82+
echo "Create docker..."
83+
docker run --rm --network=host --ipc=host --privileged \
84+
-v $(pwd):/workspace \
85+
-v /home:/home \
86+
-v /work:/work \
87+
-w /workspace \
88+
-e "MODEL_PATH=./ci_models" \
8189
-e "http_proxy=$(git config --global --get http.proxy)" \
8290
-e "https_proxy=$(git config --global --get https.proxy)" \
8391
-e "FD_API_PORT=${FD_API_PORT}" \
8492
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
8593
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
86-
${docker_image} /bin/bash -c "
94+
${docker_image} /bin/bash -c "
8795
git config --global --add safe.directory /workspace/FastDeploy
8896
cd FastDeploy
8997
bash scripts/run_ci_gcu.sh

scripts/run_ci_gcu.sh

Lines changed: 52 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,39 @@
1-
#!/bin/bash
1+
#!/usr/bin/env bash
22
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
3-
echo "$DIR"
3+
echo "Current directory: ${DIR}"
44

5-
#先kill一遍
6-
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
7-
ps -efww | grep -E '8188' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
8-
lsof -t -i :8188 | xargs kill -9 || true
5+
function stop_processes() {
6+
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
7+
ps -efww | grep -E '8188' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
8+
lsof -t -i :8188 | xargs kill -9 || true
9+
}
910

10-
export model_path=${MODEL_PATH}/paddle/ERNIE-4.5-21B-A3B-Paddle
11+
echo "Clean up processes..."
12+
stop_processes
13+
echo "Clean up completed."
14+
15+
export model_path=${MODEL_PATH}/ERNIE-4.5-21B-A3B-Paddle
1116

1217
echo "pip install requirements"
1318
python -m pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
1419
echo "uninstall org"
1520
python -m pip uninstall paddlepaddle -y
1621
python -m pip uninstall paddle-custom-gcu -y
1722
python -m pip install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
23+
python -m pip install --pre paddle-custom-gcu==3.0.0.dev20250801 -i https://www.paddlepaddle.org.cn/packages/nightly/gcu/
1824
echo "build whl"
1925
bash build.sh 1 || exit 1
2026

2127
unset http_proxy
2228
unset https_proxy
2329
unset no_proxy
2430

25-
# 起服务
2631
rm -rf log/*
2732
rm -f core*
28-
# pkill -9 python #流水线不执行这个
29-
#清空消息队列
33+
34+
# Empty the message queue
3035
ipcrm --all=msg
36+
echo "Start server..."
3137
python -m fastdeploy.entrypoints.openai.api_server \
3238
--model ${model_path} \
3339
--port 8188 \
@@ -38,21 +44,40 @@ python -m fastdeploy.entrypoints.openai.api_server \
3844
--max-num-seqs 8 \
3945
--quantization wint4 > server.log 2>&1 &
4046

41-
sleep 60
42-
# 探活
43-
TIMEOUT=$((5 * 60))
44-
INTERVAL=10 # 检查间隔(秒)
47+
echo "Waiting 90 seconds..."
48+
sleep 90
49+
50+
if grep -q "Failed to launch worker processes" server.log; then
51+
echo "Failed to launch worker processes..."
52+
stop_processes
53+
cat server.log
54+
cat log/workerlog.0
55+
exit 1
56+
fi
57+
58+
if grep -q "Traceback (most recent call last):" server.log; then
59+
echo "Some errors occurred..."
60+
stop_processes
61+
cat server.log
62+
cat log/workerlog.0
63+
exit 1
64+
fi
65+
66+
# Health check
67+
TIMEOUT=$((11 * 60))
68+
INTERVAL=30 # Check interval (seconds)
4569
ENDPOINT="http://0.0.0.0:8188/health"
46-
START_TIME=$(date +%s) # 记录开始时间戳
47-
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
70+
START_TIME=$(date +%s) # Record the start timestamp
71+
echo "Start the server health check, maximum waiting time: ${TIMEOUT} seconds..."
4872
while true; do
49-
# 计算已耗时
73+
# Used to calculate the time cost
5074
CURRENT_TIME=$(date +%s)
5175
ELAPSED=$((CURRENT_TIME - START_TIME))
5276

53-
# 超时判断
77+
# Timeout
5478
if [ $ELAPSED -ge $TIMEOUT ]; then
55-
echo -e "\n服务启动超时:经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
79+
echo -e "\nServer start timeout: After $((TIMEOUT/60)) minutes, the service still doesn't start!"
80+
stop_processes
5681
cat server.log
5782
cat log/workerlog.0
5883
exit 1
@@ -61,26 +86,27 @@ while true; do
6186
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
6287

6388
if [ "$HTTP_CODE" = "200" ]; then
64-
echo -e "\n服务启动成功!耗时 ${ELAPSED}"
89+
echo -e "\nThe server was successfully launched! Totally takes $((ELAPSED+90)) seconds."
6590
break
6691
else
6792
sleep $INTERVAL
6893
fi
6994
done
7095

7196
cat server.log
97+
echo -e "\n"
7298

73-
# 执行服务化推理
99+
echo "Start inference..."
74100
python test/ci_use/GCU/run_ernie.py
75101
exit_code=$?
76-
echo exit_code is ${exit_code}
102+
echo -e "exit_code is ${exit_code}.\n"
77103

78-
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
79-
ps -efww | grep -E '8188' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
80-
lsof -t -i :8188 | xargs kill -9 || true
104+
echo "Stop server..."
105+
stop_processes
106+
echo "Stop server done."
81107

82108
if [ ${exit_code} -ne 0 ]; then
83-
echo "log/workerlog.0"
109+
echo "Exit with error, please refer to log/workerlog.0"
84110
cat log/workerlog.0
85111
exit 1
86112
fi

test/ci_use/GCU/run_ernie.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,24 @@
1515
import openai
1616

1717
ip = "0.0.0.0"
18-
service_http_port = "8188" # 服务配置的
18+
service_http_port = "8188"
1919
client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
2020

21-
# 非流式对话
2221
response = client.chat.completions.create(
2322
model="default",
2423
messages=[
2524
{"role": "user", "content": "The largest ocean is"},
2625
],
2726
temperature=1,
2827
top_p=0,
29-
max_tokens=64,
28+
max_tokens=256,
3029
stream=False,
3130
)
32-
print(response)
31+
print(f"response is: {response}", flush=True)
32+
33+
generate_context = response.choices[0].message.content
34+
print(f"\ngenerate_context is: {generate_context}", flush=True)
35+
36+
assert "pacific ocean" in generate_context.lower(), "The answer was incorrect!"
37+
38+
print("Test successfully!", flush=True)

0 commit comments

Comments
 (0)