
Commit 93862fd

Merge branch 'develop' into chat_template_fix
2 parents f18a3b1 + 33c0197

37 files changed: +722 −485 lines

.github/workflows/_accuracy_test.yml

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
+name: Accuracy Test
+description: "Run Accuracy Tests"
+
+on:
+  workflow_call:
+    inputs:
+      DOCKER_IMAGE:
+        description: "Build Images"
+        required: true
+        type: string
+        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
+      FASTDEPLOY_ARCHIVE_URL:
+        description: "URL of the compressed FastDeploy code archive."
+        required: true
+        type: string
+      FASTDEPLOY_WHEEL_URL:
+        description: "URL of the FastDeploy Wheel."
+        required: true
+        type: string
+      CACHE_DIR:
+        description: "Cache Dir Use"
+        required: false
+        type: string
+        default: ""
+      MODEL_CACHE_DIR:
+        description: "Cache Dir Use"
+        required: false
+        type: string
+        default: ""
+
+jobs:
+  accuracy_tests:
+    runs-on: [self-hosted, GPU-h20-1Cards]
+    steps:
+      - name: Code Prepare
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
+        run: |
+          set -x
+          REPO="https://github.com/${{ github.repository }}.git"
+          FULL_REPO="${{ github.repository }}"
+          REPO_NAME="${FULL_REPO##*/}"
+          BASE_BRANCH="${{ github.base_ref }}"
+
+          # Clean the repository directory before starting
+          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
+            -e "REPO_NAME=${REPO_NAME}" \
+            ${docker_image} /bin/bash -c '
+            if [ -d ${REPO_NAME} ]; then
+              echo "Directory ${REPO_NAME} exists, removing it..."
+              rm -rf ${REPO_NAME}*
+            fi
+          '
+
+          wget -q ${fd_archive_url}
+          tar -xf FastDeploy.tar.gz
+          rm -rf FastDeploy.tar.gz
+          cd FastDeploy
+          git config --global user.name "FastDeployCI"
+          git config --global user.email "fastdeploy_ci@example.com"
+          git log -n 3 --oneline
+
+      - name: Run FastDeploy Base Tests
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
+          CACHE_DIR: ${{ inputs.CACHE_DIR }}
+          MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
+        run: |
+          runner_name="${{ runner.name }}"
+          CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
+          DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
+          DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
+
+          FLASK_PORT=$((42068 + DEVICE_PORT * 100))
+          FD_API_PORT=$((42088 + DEVICE_PORT * 100))
+          FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
+          FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
+          echo "Test ENV Parameter:"
+          echo "========================================================="
+          echo "FLASK_PORT=${FLASK_PORT}"
+          echo "FD_API_PORT=${FD_API_PORT}"
+          echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
+          echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
+          echo "DEVICES=${DEVICES}"
+          echo "========================================================="
+
+          CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
+          echo "CACHE_DIR is set to ${CACHE_DIR}"
+          if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
+            touch "${CACHE_DIR}/gitconfig"
+          fi
+          if [ ! -d "${MODEL_CACHE_DIR}" ]; then
+            echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
+            exit 1
+          fi
+
+          PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
+          LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
+          echo "==== LOG_FILE is ${LOG_FILE} ===="
+
+          echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
+
+          for port in "${PORTS[@]}"; do
+            PIDS=$(lsof -t -i :$port || true)
+            if [ -n "$PIDS" ]; then
+              echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
+              echo "$PIDS" | xargs -r kill -9
+              echo "Port $port cleared" | tee -a $LOG_FILE
+            else
+              echo "Port $port is free" | tee -a $LOG_FILE
+            fi
+          done
+
+          echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
+
+          docker run --rm --ipc=host --pid=host --net=host \
+            -v $(pwd):/workspace \
+            -w /workspace \
+            -e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
+            -e "FD_API_PORT=${FD_API_PORT}" \
+            -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
+            -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
+            -e "FLASK_PORT=${FLASK_PORT}" \
+            -v "${MODEL_CACHE_DIR}:/MODELDATA" \
+            -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
+            -v "${CACHE_DIR}/.cache:/root/.cache" \
+            -v "${CACHE_DIR}/ConfigDir:/root/.config" \
+            -e TZ="Asia/Shanghai" \
+            --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
+            python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+
+            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+            python -m pip install ${fastdeploy_wheel_url}
+            python -m pip install pytest
+
+            wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
+            chmod +x ./llm-deploy-linux-amd64
+            ./llm-deploy-linux-amd64 -python python3.10 \
+              -model_name ERNIE-4.5-0.3B-Paddle \
+              -model_path /MODELDATA \
+              --skip install
+
+            git config --global --add safe.directory /workspace/FastDeploy
+            cd FastDeploy
+            pushd test/ce/deploy
+            python3.10 deploy.py > dd.log 2>&1 &
+            sleep 3
+            curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
+              -H "Content-Type: application/json" \
+              -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"
+
+            curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90
+            popd
+
+            pushd test/ce/accuracy_cases
+            export URL=http://localhost:${FD_API_PORT}/v1/chat/completions
+            export TEMPLATE=TOKEN_LOGPROB
+            export MODEL_SIZE=0.3B
+            TEST_EXIT_CODE=0
+            python gsm8k.py || TEST_EXIT_CODE=1
+            popd
+            echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
+          '
+          if [ -f ./FastDeploy/exit_code.env ]; then
+            source ./FastDeploy/exit_code.env
+            cat ./FastDeploy/exit_code.env >> $GITHUB_ENV
+          fi
+          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
+          exit ${TEST_EXIT_CODE}
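
The `Run FastDeploy Base Tests` step derives every service port from the runner name, so jobs pinned to different cards on the same host land in disjoint 100-port bands. A standalone sketch of that derivation, runnable outside CI — the runner name below is a hypothetical example, assuming only what the workflow assumes, namely that the last dash-separated field holds the assigned card digits:

```bash
#!/bin/bash
# Re-derivation of the workflow's port scheme, outside CI.
# Hypothetical runner name: "...-03" would mean cards 0 and 3.
runner_name="self-hosted-h20-03"

CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')   # -> "03"
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)           # -> "0,3"
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)               # -> "0"

# The first card digit selects a 100-port band for each service, so two
# jobs starting on different cards of one host cannot claim the same ports.
echo "FLASK_PORT=$((42068 + DEVICE_PORT * 100))"             # 42068
echo "FD_API_PORT=$((42088 + DEVICE_PORT * 100))"            # 42088
echo "FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))"   # 42058
echo "FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))"        # 42078
```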

.github/workflows/pr_build_and_test.yml

Lines changed: 10 additions & 0 deletions
@@ -73,3 +73,13 @@ jobs:
       FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
       FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
       MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
+
+  accuracy_test:
+    name: Run Accuracy Tests
+    needs: [clone,build]
+    uses: ./.github/workflows/_accuracy_test.yml
+    with:
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
+      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
+      FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
+      MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"

README.md

Lines changed: 4 additions & 5 deletions
@@ -23,13 +23,11 @@ English | [简体中文](README_CN.md)
 </p>

 --------------------------------------------------------------------------------
-# FastDeploy 2.1: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle
+# FastDeploy : Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle

 ## News
 **[2025-08] 🔥 Released FastDeploy v2.1:** A brand-new KV Cache scheduling strategy has been introduced, and expanded support for PD separation and CUDA Graph across more models. Enhanced hardware support has been added for platforms like Kunlun and Hygon, along with comprehensive optimizations to improve the performance of both the service and inference engine.

-**[2025-07] The "FastDeploy 2.0 Inference Deployment Field Test" campaign is now live!** Complete tasks such as inference deployment of the ERNIE 4.5 series open-source models to win official FastDeploy 2.0 merchandise like bone-china mugs, plus generous prize money! 🎁 Everyone is welcome to try it out and share feedback~ 📌[Sign up here](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[Event details](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)
-
 **[2025-07] The FastDeploy 2.0 Inference Deployment Challenge is now live!** Complete the inference deployment task for the ERNIE 4.5 series open-source models to win official FastDeploy 2.0 merch and generous prizes! 🎁 You're welcome to try it out and share your feedback! 📌[Sign up here](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[Event details](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)

 **[2025-06] 🔥 Released FastDeploy v2.0:** Supports inference and deployment for ERNIE 4.5. Furthermore, we open-source an industrial-grade PD disaggregation with context caching, dynamic role switching for effective resource utilization to further enhance inference performance for MoE models.
@@ -52,14 +50,15 @@ English | [简体中文](README_CN.md)

 ## Installation

-FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. For detailed installation instructions:
+FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, **Hygon DCUs** and other hardware. For detailed installation instructions:

 - [NVIDIA GPU](./docs/get_started/installation/nvidia_gpu.md)
 - [Kunlunxin XPU](./docs/get_started/installation/kunlunxin_xpu.md)
 - [Iluvatar GPU](./docs/get_started/installation/iluvatar_gpu.md)
 - [Enflame GCU](./docs/get_started/installation/Enflame_gcu.md)
+- [Hygon DCU](./docs/get_started/installation/hygon_dcu.md)

-**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU, Hygon DCU, and MetaX GPU are currently under development and testing. Stay tuned for updates!
+**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU and MetaX GPU are currently under development and testing. Stay tuned for updates!

 ## Get Started
README_CN.md

Lines changed: 4 additions & 3 deletions
@@ -23,7 +23,7 @@
 </p>

 --------------------------------------------------------------------------------
-# FastDeploy 2.1: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle
+# FastDeploy: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle

 ## Latest News
 **[2025-08] 🔥 FastDeploy v2.1 newly released:** A brand-new KV Cache scheduling strategy, PD separation and CUDA Graph support for more models, enhanced support for more hardware such as Kunlun and Hygon, and comprehensive performance optimizations for both the service and the inference engine.
@@ -48,14 +48,15 @@

 ## Installation

-FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. Detailed installation instructions:
+FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, **Hygon DCUs**, and other hardware. Detailed installation instructions:

 - [NVIDIA GPU](./docs/zh/get_started/installation/nvidia_gpu.md)
 - [Kunlunxin XPU](./docs/zh/get_started/installation/kunlunxin_xpu.md)
 - [Iluvatar CoreX](./docs/zh/get_started/installation/iluvatar_gpu.md)
 - [Enflame S60](./docs/zh/get_started/installation/Enflame_gcu.md)
+- [Hygon DCU](./docs/zh/get_started/installation/hygon_dcu.md)

-**Note:** We are actively expanding hardware support. Other hardware platforms, including Ascend NPU, Hygon DCU, and MetaX GPU, are currently under development and testing. Stay tuned for updates!
+**Note:** We are actively expanding hardware support. Other hardware platforms, including Ascend NPU and MetaX GPU, are currently under development and testing. Stay tuned for updates!

 ## Getting Started
dockerfiles/Dockerfile.gpu

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.0.0
+FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.1.0
 ARG PADDLE_VERSION=3.1.1
 ARG FD_VERSION=2.1.0
docs/best_practices/ERNIE-4.5-0.3B-Paddle.md

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ Add the following lines to the startup parameters
 --use-cudagraph
 ```
 Notes:
-1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../parameters.md) for related configuration parameter descriptions
+1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions
 2. When CUDAGraph is enabled, if running with multi-GPUs TP>1, `--enable-custom-all-reduce` must be specified at the same time.
 3. When CUDAGraph is enabled, the scenario of `max-model-len > 32768` is not currently supported.
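
As the notes above describe, CUDAGraph stays a one-flag opt-in for this model. A minimal launch sketch with the flag enabled — the `baidu/ERNIE-4.5-0.3B-Paddle` model name follows the naming used elsewhere in these docs, and the ports are illustrative assumptions, not values the doc prescribes:

```bash
# Illustrative single-GPU launch with CUDAGraph enabled (TP=1 here, so
# note 2's --enable-custom-all-reduce requirement does not apply).
# Model name and ports are assumptions for the example only.
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-0.3B-Paddle \
    --port 8180 --engine-worker-queue-port 8181 --metrics-port 8182 \
    --use-cudagraph
```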

docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md

Lines changed: 3 additions & 5 deletions
@@ -86,7 +86,7 @@ Add the following lines to the startup parameters
 --use-cudagraph
 ```
 Notes:
-1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../parameters.md) for related configuration parameter descriptions
+1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions
 2. When CUDAGraph is enabled, if running with multi-GPUs TP>1, `--enable-custom-all-reduce` must be specified at the same time.
 3. When CUDAGraph is enabled, the scenario of `max-model-len > 32768` is not currently supported.
@@ -111,7 +111,6 @@ export INFERENCE_MSG_QUEUE_ID=1315
 export FLAGS_max_partition_size=2048
 export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
-export ENABLE_V1_KVCACHE_SCHEDULER=1

 quant_type=block_wise_fp8
 export FD_USE_DEEP_GEMM=0
@@ -121,7 +120,7 @@ python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A
 --max-num-seqs 20 \
 --num-gpu-blocks-override 40000 \
 --quantization ${quant_type} \
---gpu-memory-utilization 0.9 \
+--gpu-memory-utilization 0.9 --kv-cache-ratio 0.9 \
 --port 7012 --engine-worker-queue-port 7013 --metrics-port 7014 --tensor-parallel-size 4 \
 --cache-queue-port 7015 \
 --splitwise-role "prefill" \
@@ -132,7 +131,6 @@ export CUDA_VISIBLE_DEVICES=4,5,6,7
 export INFERENCE_MSG_QUEUE_ID=1215
 export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
-export ENABLE_V1_KVCACHE_SCHEDULER=1

 quant_type=block_wise_fp8
 export FD_USE_DEEP_GEMM=0
@@ -141,7 +139,7 @@ python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A
 --max-model-len 131072 \
 --max-num-seqs 20 \
 --quantization ${quant_type} \
---gpu-memory-utilization 0.85 \
+--gpu-memory-utilization 0.85 --kv-cache-ratio 0.1 \
 --port 9012 --engine-worker-queue-port 8013 --metrics-port 8014 --tensor-parallel-size 4 \
 --cache-queue-port 8015 \
 --innode-prefill-ports 7013 \
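
Read together, the four deployment hunks above drop the `ENABLE_V1_KVCACHE_SCHEDULER` environment variable and instead pass explicit `--kv-cache-ratio` values, weighted toward the prefill role. A condensed side-by-side of the two launches as a reading aid only — flags are abbreviated, the model name is inferred from this doc's filename, and the full commands in the hunks remain authoritative:

```bash
# Condensed from the hunks above; not a complete launch recipe
# (quantization, max-model-len, etc. are omitted).

# Prefill role: carries --kv-cache-ratio 0.9 (value copied from the hunk).
CUDA_VISIBLE_DEVICES=0,1,2,3 FD_LOG_DIR="prefill_log" \
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-21B-A3B-Paddle \
    --splitwise-role "prefill" \
    --gpu-memory-utilization 0.9 --kv-cache-ratio 0.9 \
    --port 7012 --engine-worker-queue-port 7013 --cache-queue-port 7015 \
    --tensor-parallel-size 4 &

# Decode role: carries --kv-cache-ratio 0.1 and points --innode-prefill-ports
# at the prefill node's engine-worker-queue-port (7013).
CUDA_VISIBLE_DEVICES=4,5,6,7 FD_LOG_DIR="decode_log" \
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-21B-A3B-Paddle \
    --gpu-memory-utilization 0.85 --kv-cache-ratio 0.1 \
    --port 9012 --engine-worker-queue-port 8013 --cache-queue-port 8015 \
    --innode-prefill-ports 7013 \
    --tensor-parallel-size 4 &
```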

docs/best_practices/ERNIE-4.5-300B-A47B-Paddle.md

Lines changed: 1 addition & 3 deletions
@@ -99,7 +99,6 @@ export FD_SAMPLING_CLASS=rejection
 **How to enable:** Take the deployment of a single machine with 8 GPUs and 1P1D (4 GPUs each) as an example. Compared with the default hybrid deployment method, `--splitwise-role` is required to specify the role of the node. And the GPUs and logs of the two nodes are isolated through the environment variables `FD_LOG_DIR` and `CUDA_VISIBLE_DEVICES`.
 ```
 export FD_LOG_DIR="log_prefill"
-export ENABLE_V1_KVCACHE_SCHEDULER=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 python -m fastdeploy.entrypoints.openai.api_server \
 --model baidu/ERNIE-4.5-300B-A47B-Paddle \
@@ -112,7 +111,6 @@ python -m fastdeploy.entrypoints.openai.api_server \
 ```
 ```
 export FD_LOG_DIR="log_decode"
-export ENABLE_V1_KVCACHE_SCHEDULER=1
 export CUDA_VISIBLE_DEVICES=4,5,6,7
 # Note that innode-prefill-ports is specified as the Prefill service's engine-worker-queue-port
 python -m fastdeploy.entrypoints.openai.api_server \
@@ -137,7 +135,7 @@ Add the following lines to the startup parameters
 --enable-custom-all-reduce
 ```
 Notes:
-1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../parameters.md) for related configuration parameter descriptions
+1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions
 2. When CUDAGraph is enabled, if running with multi-GPUs TP>1, `--enable-custom-all-reduce` must be specified at the same time.
 3. When CUDAGraph is enabled, the scenario of `max-model-len > 32768` is not currently supported.
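
For this model, note 2 is the operative constraint: the 1P1D example above gives each role four GPUs, so CUDAGraph cannot be enabled without its companion flag. A minimal sketch of the pairing — only the three flags and the model name are doc-sourced; everything else a real launch needs is omitted:

```bash
# Illustrative flag pairing only: per note 2, --use-cudagraph with
# tensor-parallel-size > 1 requires --enable-custom-all-reduce as well.
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
    --tensor-parallel-size 4 \
    --use-cudagraph \
    --enable-custom-all-reduce
```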
