
Commit 93862fd

Merge branch 'develop' into chat_template_fix
2 parents f18a3b1 + 33c0197

37 files changed: +722 −485 lines

.github/workflows/_accuracy_test.yml

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
+name: Accuracy Test
+description: "Run Accuracy Tests"
+
+on:
+  workflow_call:
+    inputs:
+      DOCKER_IMAGE:
+        description: "Build Images"
+        required: true
+        type: string
+        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
+      FASTDEPLOY_ARCHIVE_URL:
+        description: "URL of the compressed FastDeploy code archive."
+        required: true
+        type: string
+      FASTDEPLOY_WHEEL_URL:
+        description: "URL of the FastDeploy Wheel."
+        required: true
+        type: string
+      CACHE_DIR:
+        description: "Cache Dir Use"
+        required: false
+        type: string
+        default: ""
+      MODEL_CACHE_DIR:
+        description: "Cache Dir Use"
+        required: false
+        type: string
+        default: ""
+
+jobs:
+  accuracy_tests:
+    runs-on: [self-hosted, GPU-h20-1Cards]
+    steps:
+      - name: Code Prepare
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
+        run: |
+          set -x
+          REPO="https://github.com/${{ github.repository }}.git"
+          FULL_REPO="${{ github.repository }}"
+          REPO_NAME="${FULL_REPO##*/}"
+          BASE_BRANCH="${{ github.base_ref }}"
+
+          # Clean the repository directory before starting
+          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
+            -e "REPO_NAME=${REPO_NAME}" \
+            ${docker_image} /bin/bash -c '
+            if [ -d ${REPO_NAME} ]; then
+              echo "Directory ${REPO_NAME} exists, removing it..."
+              rm -rf ${REPO_NAME}*
+            fi
+          '
+
+          wget -q ${fd_archive_url}
+          tar -xf FastDeploy.tar.gz
+          rm -rf FastDeploy.tar.gz
+          cd FastDeploy
+          git config --global user.name "FastDeployCI"
+          git config --global user.email "fastdeploy_ci@example.com"
+          git log -n 3 --oneline
+
+      - name: Run FastDeploy Base Tests
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
+          CACHE_DIR: ${{ inputs.CACHE_DIR }}
+          MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
+        run: |
+          runner_name="${{ runner.name }}"
+          CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
+          DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
+          DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
+
+          FLASK_PORT=$((42068 + DEVICE_PORT * 100))
+          FD_API_PORT=$((42088 + DEVICE_PORT * 100))
+          FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
+          FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
+          echo "Test ENV Parameter:"
+          echo "========================================================="
+          echo "FLASK_PORT=${FLASK_PORT}"
+          echo "FD_API_PORT=${FD_API_PORT}"
+          echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
+          echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
+          echo "DEVICES=${DEVICES}"
+          echo "========================================================="
+
+          CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
+          echo "CACHE_DIR is set to ${CACHE_DIR}"
+          if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
+            touch "${CACHE_DIR}/gitconfig"
+          fi
+          if [ ! -d "${MODEL_CACHE_DIR}" ]; then
+            echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
+            exit 1
+          fi
+
+          PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
+          LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
+          echo "==== LOG_FILE is ${LOG_FILE} ===="
+
+          echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
+
+          for port in "${PORTS[@]}"; do
+            PIDS=$(lsof -t -i :$port || true)
+            if [ -n "$PIDS" ]; then
+              echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
+              echo "$PIDS" | xargs -r kill -9
+              echo "Port $port cleared" | tee -a $LOG_FILE
+            else
+              echo "Port $port is free" | tee -a $LOG_FILE
+            fi
+          done
+
+          echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
+
+          docker run --rm --ipc=host --pid=host --net=host \
+            -v $(pwd):/workspace \
+            -w /workspace \
+            -e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
+            -e "FD_API_PORT=${FD_API_PORT}" \
+            -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
+            -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
+            -e "FLASK_PORT=${FLASK_PORT}" \
+            -v "${MODEL_CACHE_DIR}:/MODELDATA" \
+            -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
+            -v "${CACHE_DIR}/.cache:/root/.cache" \
+            -v "${CACHE_DIR}/ConfigDir:/root/.config" \
+            -e TZ="Asia/Shanghai" \
+            --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
+            python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+
+            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+            python -m pip install ${fastdeploy_wheel_url}
+            python -m pip install pytest
+
+            wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
+            chmod +x ./llm-deploy-linux-amd64
+            ./llm-deploy-linux-amd64 -python python3.10 \
+              -model_name ERNIE-4.5-0.3B-Paddle \
+              -model_path /MODELDATA \
+              --skip install
+
+            git config --global --add safe.directory /workspace/FastDeploy
+            cd FastDeploy
+            pushd test/ce/deploy
+            python3.10 deploy.py > dd.log 2>&1 &
+            sleep 3
+            curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
+              -H "Content-Type: application/json" \
+              -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"
+
+            curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90
+            popd
+
+            pushd test/ce/accuracy_cases
+            export URL=http://localhost:${FD_API_PORT}/v1/chat/completions
+            export TEMPLATE=TOKEN_LOGPROB
+            export MODEL_SIZE=0.3B
+            TEST_EXIT_CODE=0
+            python gsm8k.py || TEST_EXIT_CODE=1
+            popd
+            echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
+          '
+          if [ -f ./FastDeploy/exit_code.env ]; then
+            source ./FastDeploy/exit_code.env
+            cat ./FastDeploy/exit_code.env >> $GITHUB_ENV
+          fi
+          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
+          exit ${TEST_EXIT_CODE}
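
The `Run FastDeploy Base Tests` step derives every service port from the runner name, so jobs pinned to different cards on the same host land in disjoint 100-port bands. A standalone sketch of that derivation, runnable outside CI — the runner name below is a hypothetical example, assuming only what the workflow assumes, namely that the last dash-separated field holds the assigned card digits:

```bash
#!/bin/bash
# Re-derivation of the workflow's port scheme, outside CI.
# Hypothetical runner name: "...-03" would mean cards 0 and 3.
runner_name="self-hosted-h20-03"

CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')   # -> "03"
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)           # -> "0,3"
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)               # -> "0"

# The first card digit selects a 100-port band for each service, so two
# jobs starting on different cards of one host cannot claim the same ports.
echo "FLASK_PORT=$((42068 + DEVICE_PORT * 100))"             # 42068
echo "FD_API_PORT=$((42088 + DEVICE_PORT * 100))"            # 42088
echo "FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))"   # 42058
echo "FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))"        # 42078
```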

.github/workflows/pr_build_and_test.yml

Lines changed: 10 additions & 0 deletions
@@ -73,3 +73,13 @@ jobs:
       FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
       FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
       MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
+
+  accuracy_test:
+    name: Run Accuracy Tests
+    needs: [clone,build]
+    uses: ./.github/workflows/_accuracy_test.yml
+    with:
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
+      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
+      FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
+      MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"

README.md

Lines changed: 4 additions & 5 deletions
@@ -23,13 +23,11 @@ English | [简体中文](README_CN.md)
 </p>

 --------------------------------------------------------------------------------
-# FastDeploy 2.1: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle
+# FastDeploy : Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle

 ## News
 **[2025-08] 🔥 Released FastDeploy v2.1:** A brand-new KV Cache scheduling strategy has been introduced, and expanded support for PD separation and CUDA Graph across more models. Enhanced hardware support has been added for platforms like Kunlun and Hygon, along with comprehensive optimizations to improve the performance of both the service and inference engine.

-**[2025-07] The "FastDeploy 2.0 Inference Deployment Field Test" campaign is now live!** Complete tasks such as inference deployment of the ERNIE 4.5 series open-source models to win official FastDeploy 2.0 merchandise like bone-china mugs, plus generous prize money! 🎁 Everyone is welcome to try it out and share feedback~ 📌[Sign up here](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[Event details](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)
-
 **[2025-07] The FastDeploy 2.0 Inference Deployment Challenge is now live!** Complete the inference deployment task for the ERNIE 4.5 series open-source models to win official FastDeploy 2.0 merch and generous prizes! 🎁 You're welcome to try it out and share your feedback! 📌[Sign up here](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[Event details](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)

 **[2025-06] 🔥 Released FastDeploy v2.0:** Supports inference and deployment for ERNIE 4.5. Furthermore, we open-source an industrial-grade PD disaggregation with context caching, dynamic role switching for effective resource utilization to further enhance inference performance for MoE models.
@@ -52,14 +50,15 @@ English | [简体中文](README_CN.md)

 ## Installation

-FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. For detailed installation instructions:
+FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, **Hygon DCUs** and other hardware. For detailed installation instructions:

 - [NVIDIA GPU](./docs/get_started/installation/nvidia_gpu.md)
 - [Kunlunxin XPU](./docs/get_started/installation/kunlunxin_xpu.md)
 - [Iluvatar GPU](./docs/get_started/installation/iluvatar_gpu.md)
 - [Enflame GCU](./docs/get_started/installation/Enflame_gcu.md)
+- [Hygon DCU](./docs/get_started/installation/hygon_dcu.md)

-**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU, Hygon DCU, and MetaX GPU are currently under development and testing. Stay tuned for updates!
+**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU and MetaX GPU are currently under development and testing. Stay tuned for updates!

 ## Get Started
README_CN.md

Lines changed: 4 additions & 3 deletions
@@ -23,7 +23,7 @@
 </p>

 --------------------------------------------------------------------------------
-# FastDeploy 2.1: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle
+# FastDeploy: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle

 ## Latest News
 **[2025-08] 🔥 FastDeploy v2.1 newly released:** A brand-new KV Cache scheduling strategy, PD separation and CUDA Graph support for more models, enhanced support for more hardware such as Kunlun and Hygon, and comprehensive performance optimizations for both the service and the inference engine.
@@ -48,14 +48,15 @@

 ## Installation

-FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. Detailed installation instructions:
+FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, **Hygon DCUs**, and other hardware. Detailed installation instructions:

 - [NVIDIA GPU](./docs/zh/get_started/installation/nvidia_gpu.md)
 - [Kunlunxin XPU](./docs/zh/get_started/installation/kunlunxin_xpu.md)
 - [Iluvatar CoreX](./docs/zh/get_started/installation/iluvatar_gpu.md)
 - [Enflame S60](./docs/zh/get_started/installation/Enflame_gcu.md)
+- [Hygon DCU](./docs/zh/get_started/installation/hygon_dcu.md)

-**Note:** We are actively expanding hardware support. Other hardware platforms, including Ascend NPU, Hygon DCU, and MetaX GPU, are currently under development and testing. Stay tuned for updates!
+**Note:** We are actively expanding hardware support. Other hardware platforms, including Ascend NPU and MetaX GPU, are currently under development and testing. Stay tuned for updates!

 ## Getting Started
dockerfiles/Dockerfile.gpu

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.0.0
+FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.1.0
 ARG PADDLE_VERSION=3.1.1
 ARG FD_VERSION=2.1.0
docs/best_practices/ERNIE-4.5-0.3B-Paddle.md

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ Add the following lines to the startup parameters
 --use-cudagraph
 ```
 Notes:
-1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../parameters.md) for related configuration parameter descriptions
+1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions
 2. When CUDAGraph is enabled, if running with multi-GPUs TP>1, `--enable-custom-all-reduce` must be specified at the same time.
 3. When CUDAGraph is enabled, the scenario of `max-model-len > 32768` is not currently supported.
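
As the notes above describe, CUDAGraph stays a one-flag opt-in for this model. A minimal launch sketch with the flag enabled — the `baidu/ERNIE-4.5-0.3B-Paddle` model name follows the naming used elsewhere in these docs, and the ports are illustrative assumptions, not values the doc prescribes:

```bash
# Illustrative single-GPU launch with CUDAGraph enabled (TP=1 here, so
# note 2's --enable-custom-all-reduce requirement does not apply).
# Model name and ports are assumptions for the example only.
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-0.3B-Paddle \
    --port 8180 --engine-worker-queue-port 8181 --metrics-port 8182 \
    --use-cudagraph
```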

docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md

Lines changed: 3 additions & 5 deletions
@@ -86,7 +86,7 @@ Add the following lines to the startup parameters
 --use-cudagraph
 ```
 Notes:
-1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../parameters.md) for related configuration parameter descriptions
+1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions
 2. When CUDAGraph is enabled, if running with multi-GPUs TP>1, `--enable-custom-all-reduce` must be specified at the same time.
 3. When CUDAGraph is enabled, the scenario of `max-model-len > 32768` is not currently supported.
@@ -111,7 +111,6 @@ export INFERENCE_MSG_QUEUE_ID=1315
 export FLAGS_max_partition_size=2048
 export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
-export ENABLE_V1_KVCACHE_SCHEDULER=1

 quant_type=block_wise_fp8
 export FD_USE_DEEP_GEMM=0
@@ -121,7 +120,7 @@ python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A
 --max-num-seqs 20 \
 --num-gpu-blocks-override 40000 \
 --quantization ${quant_type} \
---gpu-memory-utilization 0.9 \
+--gpu-memory-utilization 0.9 --kv-cache-ratio 0.9 \
 --port 7012 --engine-worker-queue-port 7013 --metrics-port 7014 --tensor-parallel-size 4 \
 --cache-queue-port 7015 \
 --splitwise-role "prefill" \
@@ -132,7 +131,6 @@ export CUDA_VISIBLE_DEVICES=4,5,6,7
 export INFERENCE_MSG_QUEUE_ID=1215
 export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
-export ENABLE_V1_KVCACHE_SCHEDULER=1

 quant_type=block_wise_fp8
 export FD_USE_DEEP_GEMM=0
@@ -141,7 +139,7 @@ python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A
 --max-model-len 131072 \
 --max-num-seqs 20 \
 --quantization ${quant_type} \
---gpu-memory-utilization 0.85 \
+--gpu-memory-utilization 0.85 --kv-cache-ratio 0.1 \
 --port 9012 --engine-worker-queue-port 8013 --metrics-port 8014 --tensor-parallel-size 4 \
 --cache-queue-port 8015 \
 --innode-prefill-ports 7013 \
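
Read together, the four deployment hunks above drop the `ENABLE_V1_KVCACHE_SCHEDULER` environment variable and instead pass explicit `--kv-cache-ratio` values, weighted toward the prefill role. A condensed side-by-side of the two launches as a reading aid only — flags are abbreviated, the model name is inferred from this doc's filename, and the full commands in the hunks remain authoritative:

```bash
# Condensed from the hunks above; not a complete launch recipe
# (quantization, max-model-len, etc. are omitted).

# Prefill role: carries --kv-cache-ratio 0.9 (value copied from the hunk).
CUDA_VISIBLE_DEVICES=0,1,2,3 FD_LOG_DIR="prefill_log" \
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-21B-A3B-Paddle \
    --splitwise-role "prefill" \
    --gpu-memory-utilization 0.9 --kv-cache-ratio 0.9 \
    --port 7012 --engine-worker-queue-port 7013 --cache-queue-port 7015 \
    --tensor-parallel-size 4 &

# Decode role: carries --kv-cache-ratio 0.1 and points --innode-prefill-ports
# at the prefill node's engine-worker-queue-port (7013).
CUDA_VISIBLE_DEVICES=4,5,6,7 FD_LOG_DIR="decode_log" \
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-21B-A3B-Paddle \
    --gpu-memory-utilization 0.85 --kv-cache-ratio 0.1 \
    --port 9012 --engine-worker-queue-port 8013 --cache-queue-port 8015 \
    --innode-prefill-ports 7013 \
    --tensor-parallel-size 4 &
```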

docs/best_practices/ERNIE-4.5-300B-A47B-Paddle.md

Lines changed: 1 addition & 3 deletions
@@ -99,7 +99,6 @@ export FD_SAMPLING_CLASS=rejection
 **How to enable:** Take the deployment of a single machine with 8 GPUs and 1P1D (4 GPUs each) as an example. Compared with the default hybrid deployment method, `--splitwise-role` is required to specify the role of the node. And the GPUs and logs of the two nodes are isolated through the environment variables `FD_LOG_DIR` and `CUDA_VISIBLE_DEVICES`.
 ```
 export FD_LOG_DIR="log_prefill"
-export ENABLE_V1_KVCACHE_SCHEDULER=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 python -m fastdeploy.entrypoints.openai.api_server \
 --model baidu/ERNIE-4.5-300B-A47B-Paddle \
@@ -112,7 +111,6 @@ python -m fastdeploy.entrypoints.openai.api_server \
 ```
 ```
 export FD_LOG_DIR="log_decode"
-export ENABLE_V1_KVCACHE_SCHEDULER=1
 export CUDA_VISIBLE_DEVICES=4,5,6,7
 # Note that innode-prefill-ports is specified as the Prefill service's engine-worker-queue-port
 python -m fastdeploy.entrypoints.openai.api_server \
@@ -137,7 +135,7 @@ Add the following lines to the startup parameters
 --enable-custom-all-reduce
 ```
 Notes:
-1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../parameters.md) for related configuration parameter descriptions
+1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions
 2. When CUDAGraph is enabled, if running with multi-GPUs TP>1, `--enable-custom-all-reduce` must be specified at the same time.
 3. When CUDAGraph is enabled, the scenario of `max-model-len > 32768` is not currently supported.
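
For this model, note 2 is the operative constraint: the 1P1D example above gives each role four GPUs, so CUDAGraph cannot be enabled without its companion flag. A minimal sketch of the pairing — only the three flags and the model name are doc-sourced; everything else a real launch needs is omitted:

```bash
# Illustrative flag pairing only: per note 2, --use-cudagraph with
# tensor-parallel-size > 1 requires --enable-custom-all-reduce as well.
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
    --tensor-parallel-size 4 \
    --use-cudagraph \
    --enable-custom-all-reduce
```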
