diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml new file mode 100644 index 0000000000..6e62e9fb47 --- /dev/null +++ b/.github/workflows/_base_test.yml @@ -0,0 +1,162 @@ +name: Base Test +description: "Run Base Tests" + +on: + workflow_call: + inputs: + DOCKER_IMAGE: + description: "Build Images" + required: true + type: string + default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310" + FASTDEPLOY_ARCHIVE_URL: + description: "URL of the compressed FastDeploy code archive." + required: true + type: string + FASTDEPLOY_WHEEL_URL: + description: "URL of the FastDeploy Wheel." + required: true + type: string + CACHE_DIR: + description: "Cache Dir Use" + required: false + type: string + default: "" + MODEL_CACHE_DIR: + description: "Cache Dir Use" + required: false + type: string + default: "" + +jobs: + base_tests: + runs-on: [self-hosted, GPU-h20-1Cards] + steps: + - name: Code Prepare + shell: bash + env: + docker_image: ${{ inputs.DOCKER_IMAGE }} + fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }} + run: | + set -x + REPO="https://github.com/${{ github.repository }}.git" + FULL_REPO="${{ github.repository }}" + REPO_NAME="${FULL_REPO##*/}" + BASE_BRANCH="${{ github.base_ref }}" + + # Clean the repository directory before starting + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ + -e "REPO_NAME=${REPO_NAME}" \ + ${docker_image} /bin/bash -c ' + if [ -d ${REPO_NAME} ]; then + echo "Directory ${REPO_NAME} exists, removing it..." + rm -rf ${REPO_NAME}* + fi + ' + + wget -q ${fd_archive_url} + tar -xf FastDeploy.tar.gz + rm -rf FastDeploy.tar.gz + cd FastDeploy + git config --global user.name "FastDeployCI" + git config --global user.email "fastdeploy_ci@example.com" + git log -n 3 --oneline + + - name: Run FastDeploy Base Tests + shell: bash + env: + docker_image: ${{ inputs.DOCKER_IMAGE }} + fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }} + CACHE_DIR: ${{ inputs.CACHE_DIR }} + MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }} + run: | + runner_name="${{ runner.name }}" + last_char="${runner_name: -1}" + + if [[ "$last_char" =~ [0-7] ]]; then + DEVICES="$last_char" + else + DEVICES="0" + fi + + FLASK_PORT=$((42068 + DEVICES * 100)) + FD_API_PORT=$((42088 + DEVICES * 100)) + FD_ENGINE_QUEUE_PORT=$((42058 + DEVICES * 100)) + FD_METRICS_PORT=$((42078 + DEVICES * 100)) + echo "Test ENV Parameter:" + echo "=========================================================" + echo "FLASK_PORT=${FLASK_PORT}" + echo "FD_API_PORT=${FD_API_PORT}" + echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" + echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "DEVICES=${DEVICES}" + echo "=========================================================" + + CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}" + echo "CACHE_DIR is set to ${CACHE_DIR}" + if [ ! -f "${CACHE_DIR}/gitconfig" ]; then + touch "${CACHE_DIR}/gitconfig" + fi + if [ ! -d "${MODEL_CACHE_DIR}" ]; then + echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist." 
+ exit 1 + fi + + PARENT_DIR=$(dirname "$WORKSPACE") + + docker run --rm --ipc=host --pid=host --net=host \ + -v $(pwd):/workspace \ + -w /workspace \ + -e fastdeploy_wheel_url=${fastdeploy_wheel_url} \ + -e "FD_API_PORT=${FD_API_PORT}" \ + -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \ + -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ + -e "FLASK_PORT=${FLASK_PORT}" \ + -v "${MODEL_CACHE_DIR}:/MODELDATA" \ + -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ + -v "${CACHE_DIR}/.cache:/root/.cache" \ + -v "${CACHE_DIR}/ConfigDir:/root/.config" \ + -e TZ="Asia/Shanghai" \ + --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' + # python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + + pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/ + pip config set install.trusted-host pip.baidu.com + pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + python -m pip install ${fastdeploy_wheel_url} + python -m pip install pytest + + wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64 + chmod +x ./llm-deploy-linux-amd64 + ./llm-deploy-linux-amd64 -python python3.10 \ + -model_name ERNIE-4.5-0.3B-Paddle \ + -model_path /MODELDATA \ + --skip install + + git config --global --add safe.directory /workspace/FastDeploy + cd FastDeploy + pushd test/ce/deploy + python3.10 deploy.py > dd.log 2>&1 & + sleep 3 + curl -X POST http://0.0.0.0:${FLASK_PORT}/start \ + -H "Content-Type: application/json" \ + -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}" + + curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90 + popd + + pushd test/ce/server + export URL=http://localhost:${FD_API_PORT}/v1/chat/completions + export TEMPLATE=TOKEN_LOGPROB + TEST_EXIT_CODE=0 + python -m pytest -sv . || TEST_EXIT_CODE=$? 
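For reference, the port arithmetic and readiness handshake used in this step can be reproduced outside the workflow. The Python sketch below is a hypothetical helper, not part of this PR: it mirrors the shell logic above, where the runner name's trailing digit selects the GPU and a disjoint port block, and it drives the same `/start` → `/wait_for_infer` calls served by `test/ce/deploy/deploy.py` introduced later in this diff. `requests` and the model path are assumptions for illustration.

```python
# Hypothetical reproduction of the CI port scheme and readiness handshake
# (illustration only; not part of the workflow or deploy.py).
import requests


def ci_ports(runner_suffix: str) -> dict:
    # A trailing digit 0-7 on the runner name selects the GPU; anything else
    # falls back to device 0, exactly like the shell logic above.
    device = int(runner_suffix) if runner_suffix.isdigit() and int(runner_suffix) <= 7 else 0
    return {
        "DEVICES": device,
        "FLASK_PORT": 42068 + device * 100,
        "FD_API_PORT": 42088 + device * 100,
        "FD_ENGINE_QUEUE_PORT": 42058 + device * 100,
        "FD_METRICS_PORT": 42078 + device * 100,
    }


def start_and_wait(flask_port: int, model_path: str, timeout: int = 90) -> None:
    base = f"http://127.0.0.1:{flask_port}"
    # Register the model with the deploy service (same payload as the curl call above).
    requests.post(f"{base}/start", json={"--model": model_path}, timeout=10)
    # Stream readiness updates until the api_server is healthy or the timeout is hit.
    with requests.post(f"{base}/wait_for_infer", params={"timeout": timeout}, stream=True) as resp:
        for line in resp.iter_lines():
            if line:
                print(line.decode("utf-8", errors="ignore"))


if __name__ == "__main__":
    ports = ci_ports("1")  # e.g. a runner name ending in "1" -> GPU 1, FLASK_PORT 42168
    start_and_wait(ports["FLASK_PORT"], "/MODELDATA/ERNIE-4.5-0.3B-Paddle")
```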
+ popd + echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env + ' + if [ -f ./FastDeploy/exit_code.env ]; then + source ./FastDeploy/exit_code.env + cat ./FastDeploy/exit_code.env >> $GITHUB_ENV + fi + echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" + exit ${TEST_EXIT_CODE} diff --git a/.github/workflows/_clone_linux.yml b/.github/workflows/_clone_linux.yml index 34ee2343ee..5efdba50cc 100644 --- a/.github/workflows/_clone_linux.yml +++ b/.github/workflows/_clone_linux.yml @@ -68,7 +68,7 @@ jobs: branch_name=${{ github.ref_name }} target_path=paddle-github-action/BRANCH/FastDeploy/${branch_name}/${commit_id} fi - wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py + wget -O bos_tools.py -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py push_file=$(realpath bos_tools.py) python -m pip install bce-python-sdk==0.9.29 ls diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml index 3a6aff7de1..366beaecbb 100644 --- a/.github/workflows/_logprob_test_linux.yml +++ b/.github/workflows/_logprob_test_linux.yml @@ -70,10 +70,18 @@ jobs: DEVICES="0" fi - FLASK_PORT=$((9160 + DEVICES * 100)) - FD_API_PORT=$((9180 + DEVICES * 100)) - FD_ENGINE_QUEUE_PORT=$((9150 + DEVICES * 100)) - FD_METRICS_PORT=$((9170 + DEVICES * 100)) + FLASK_PORT=$((42068 + DEVICES * 100)) + FD_API_PORT=$((42088 + DEVICES * 100)) + FD_ENGINE_QUEUE_PORT=$((42058 + DEVICES * 100)) + FD_METRICS_PORT=$((42078 + DEVICES * 100)) + echo "Test ENV Parameter:" + echo "=========================================================" + echo "FLASK_PORT=${FLASK_PORT}" + echo "FD_API_PORT=${FD_API_PORT}" + echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" + echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "DEVICES=${DEVICES}" + echo "=========================================================" CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}" echo "CACHE_DIR is set to ${CACHE_DIR}" @@ -86,8 +94,10 @@ jobs: fi PARENT_DIR=$(dirname "$WORKSPACE") + unset http_proxy + unset https_proxy - docker run --ipc=host --pid=host --net=host \ + docker run --rm --ipc=host --pid=host --net=host \ -v $(pwd):/workspace \ -w /workspace \ -e fastdeploy_wheel_url=${fastdeploy_wheel_url} \ @@ -100,7 +110,7 @@ jobs: -v "${CACHE_DIR}/.cache:/root/.cache" \ -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ - --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c ' + --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' # python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ @@ -124,6 +134,10 @@ jobs: -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}" curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90 + curl -s -o /dev/null -w "%{http_code}" -m 2 "http://0.0.0.0:${FD_API_PORT}/health" + curl -X POST "http://0.0.0.0:${FD_API_PORT}/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}" set +e rm -rf ./baseline_output cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output diff --git a/.github/workflows/pr_build_and_test.yml b/.github/workflows/pr_build_and_test.yml index 0123e5a554..7ba2e7f3ef 100644 --- 
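The logprob job now sanity-checks the freshly started server before the baseline comparison: it probes `/health` and issues one chat completion with `logprobs: true`. A minimal Python equivalent of that smoke check is sketched below; the port number and prompt are illustrative, and the response layout assumes the usual OpenAI-compatible `choices[0].logprobs` shape.

```python
# Minimal sketch of the added smoke check: a /health probe plus one chat completion
# with logprobs enabled. Port and prompt are illustrative placeholders.
import requests


def smoke_check(api_port: int) -> None:
    base = f"http://127.0.0.1:{api_port}"

    health = requests.get(f"{base}/health", timeout=2)
    print("health:", health.status_code)

    payload = {
        "messages": [{"role": "user", "content": "1+1=?"}],
        "logprobs": True,
    }
    resp = requests.post(f"{base}/v1/chat/completions", json=payload, timeout=60)
    resp.raise_for_status()
    choice = resp.json()["choices"][0]
    # Assumed OpenAI-compatible layout; the per-token logprobs live here if present.
    print(choice.get("logprobs"))


if __name__ == "__main__":
    smoke_check(42088)  # FD_API_PORT for a runner ending in 0 under the new port scheme
```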
a/.github/workflows/pr_build_and_test.yml +++ b/.github/workflows/pr_build_and_test.yml @@ -19,7 +19,7 @@ jobs: needs: clone uses: ./.github/workflows/_build_linux.yml with: - DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310 + DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} COMPILE_ARCH: "89,90" WITH_NIGHTLY_BUILD: "OFF" @@ -39,7 +39,7 @@ jobs: needs: [clone,build] uses: ./.github/workflows/_unit_test_coverage.yml with: - DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310 + DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }} @@ -48,7 +48,7 @@ jobs: needs: [build] uses: ./.github/workflows/_logprob_test_linux.yml with: - DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310 + DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate PADDLETEST_ARCHIVE_URL: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz" FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }} MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelCache" @@ -61,3 +61,13 @@ jobs: DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126 FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }} + + base_test: + name: Run Base Tests + needs: [clone,build] + uses: ./.github/workflows/_base_test.yml + with: + DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate + FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} + FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }} + MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelCache" diff --git a/test/ce/deploy/deploy.py b/test/ce/deploy/deploy.py new file mode 100644 index 0000000000..5ec7e1f22a --- /dev/null +++ b/test/ce/deploy/deploy.py @@ -0,0 +1,456 @@ +import ast +import json +import os +import re +import signal +import socket +import subprocess +import sys +import time + +import requests +import yaml +from flask import Flask, Response, jsonify, request + +app = Flask(__name__) + + +def get_base_port(): + nv_visible_devices = os.environ.get("NVIDIA_VISIBLE_DEVICES", "") + if not nv_visible_devices or nv_visible_devices.lower() == "all": + return 8000 + # 提取第一个数字 + match = re.search(r"\d+", nv_visible_devices) + if match: + return int(match.group(0)) * 100 + 8000 + return 8000 + + +# 默认参数值 +PID_FILE = "pid_port" +LOG_FILE = "server.log" +base_port = get_base_port() +FLASK_PORT = int(os.environ.get("FLASK_PORT", base_port + 1)) +FD_API_PORT = int(os.environ.get("FD_API_PORT", base_port + 2)) +FD_ENGINE_QUEUE_PORT = int(os.environ.get("FD_ENGINE_QUEUE_PORT", base_port + 3)) +FD_METRICS_PORT = int(os.environ.get("FD_METRICS_PORT", base_port + 4)) +DEFAULT_PARAMS = { + "--port": FD_API_PORT, + "--engine-worker-queue-port": FD_ENGINE_QUEUE_PORT, + "--metrics-port": FD_METRICS_PORT, + "--enable-logprob": True, +} + + +def build_command(config): + """根据配置构建启动命令""" + # 基础命令 + cmd = [ + "python", + "-m", + "fastdeploy.entrypoints.openai.api_server", + ] + + # 添加配置参数 + for key, value in config.items(): + if "--enable" in key: + if value: + 
cmd.append(key) + else: + cmd.extend([key, str(value)]) + + return cmd + + +def merge_configs(base_config, override_config): + """合并配置,优先级:override_config > base_config""" + merged = base_config.copy() + + if override_config: + for key in override_config: + merged[key] = override_config[key] + + return merged + + +def is_port_in_use(port): + """检查端口是否被占用""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex(("localhost", port)) == 0 + + +def get_server_pid(): + """获取服务进程ID PORT""" + if os.path.exists(PID_FILE): + with open(PID_FILE, "r") as f: + data = yaml.safe_load(f) + return data + return None + + +def is_server_running(): + """检查服务是否正在运行""" + pid_port = get_server_pid() + if pid_port is None: + return False, {"status": "Server not running..."} + + _, port = pid_port["PID"], pid_port["PORT"] + health_check_endpoint = f"http://0.0.0.0:{port}/health" + + if os.path.exists(LOG_FILE): + with open(LOG_FILE, "r") as f: + msg = f.readlines() + result = parse_tqdm_progress(msg) + + try: + response = requests.get(health_check_endpoint, timeout=2) + return response.status_code == 200, result + except requests.exceptions.RequestException: + return False, result + + +def parse_tqdm_progress(log_lines): + """ + 解析 tqdm 风格的进度条 + """ + tqdm_pattern = re.compile( + r"(?P.+?):\s+(?P\d+)%\|(?P.+?)\|\s+(?P\d+/\d+)\s+\[(?P\d+:\d+)<(?P\d+:\d+),\s+(?P[\d\.]+it/s)\]" + ) + + for line in reversed(log_lines): + match = tqdm_pattern.search(line) + if match: + data = match.groupdict() + return { + "status": "服务启动中", + "progress": { + "percent": int(data["percent"]), + "step": data["step"], + "speed": data["speed"], + "eta": data["eta"], + "elapsed": data["elapsed"], + "bar": data["bar"].strip(), + }, + "raw_line": line.strip(), + } + return {"status": "服务启动中", "progress": {}, "raw_line": log_lines[-1] if log_lines else "server.log为空"} + + +def stop_server(signum=None, frame=None): + """停止大模型推理服务""" + pid_port = get_server_pid() + if pid_port is None: + if signum: + sys.exit(0) + return jsonify({"status": "error", "message": "Service is not running"}), 400 + + server_pid, _ = pid_port["PID"], pid_port["PORT"] + + # 清理PID文件 + if os.path.exists(PID_FILE): + os.remove(PID_FILE) + if os.path.exists("gemm_profiles.json"): + os.remove("gemm_profiles.json") + + try: + # 终止进程组(包括所有子进程) + os.killpg(os.getpgid(pid_port["PID"]), signal.SIGTERM) + except Exception as e: + print(f"Failed to stop server: {e}") + + for port in [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT]: + try: + output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip() + for pid in output.splitlines(): + os.kill(int(pid), signal.SIGKILL) + print(f"Killed process on port {port}, pid={pid}") + except Exception as e: + print(f"Failed to killed process on port: {e}") + # 若log目录存在,则重命名为log_timestamp + if os.path.isdir("./log"): + os.rename("./log", "./log_{}".format(time.strftime("%Y%m%d%H%M%S"))) + + if signum: + sys.exit(0) + + return jsonify({"status": "success", "message": "Service stopped", "pid": server_pid}), 200 + + +# 捕获 SIGINT (Ctrl+C) 和 SIGTERM (kill) +signal.signal(signal.SIGINT, stop_server) +signal.signal(signal.SIGTERM, stop_server) + + +@app.route("/start", methods=["POST"]) +def start_service(): + """启动大模型推理服务""" + # 检查服务是否已在运行 + if is_server_running()[0]: + return Response( + json.dumps({"status": "error", "message": "服务已启动,无需start"}, ensure_ascii=False), + status=400, + content_type="application/json", + ) + + try: + base_config = DEFAULT_PARAMS + + override_config = 
request.get_json() or {} + + final_config = merge_configs(base_config, override_config) + + global FD_API_PORT + global FD_ENGINE_QUEUE_PORT + global FD_METRICS_PORT + FD_API_PORT = final_config["--port"] + FD_ENGINE_QUEUE_PORT = final_config["--engine-worker-queue-port"] + FD_METRICS_PORT = final_config["--metrics-port"] + + # 构建命令 + cmd = build_command(final_config) + except Exception as e: + return Response( + json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + status=500, + content_type="application/json", + ) + + print("cmd", cmd) + + try: + # 设置环境变量并启动进程 + env = os.environ.copy() + + with open(LOG_FILE, "w") as log: + process = subprocess.Popen(cmd, stdout=log, stderr=log, env=env, start_new_session=True) + + # 保存进程ID,port到yaml文件 + with open(PID_FILE, "w") as f: + yaml.dump({"PID": process.pid, "PORT": final_config["--port"]}, f) + + json_data = { + "status": "success", + "message": "服务启动命令已执行", + "pid": process.pid, + "config": final_config, + "log_file": LOG_FILE, + "cmd": cmd, + "port_info": { + "api_port": FD_API_PORT, + "queue_port": FD_ENGINE_QUEUE_PORT, + "metrics_port": FD_METRICS_PORT, + }, + } + + return Response(json.dumps(json_data, ensure_ascii=False), status=200, content_type="application/json") + except Exception as e: + return Response( + json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + status=500, + content_type="application/json", + ) + + +@app.route("/switch", methods=["POST"]) +def switch_service(): + """切换模型服务""" + # kill掉已有服务 + stop_server() + time.sleep(2) + + try: + base_config = DEFAULT_PARAMS + + override_config = request.get_json() or {} + + final_config = merge_configs(base_config, override_config) + + global FD_API_PORT + global FD_ENGINE_QUEUE_PORT + global FD_METRICS_PORT + FD_API_PORT = final_config["--port"] + FD_ENGINE_QUEUE_PORT = final_config["--engine-worker-queue-port"] + FD_METRICS_PORT = final_config["--metrics-port"] + + # 构建命令 + cmd = build_command(final_config) + except Exception as e: + return Response( + json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + status=500, + content_type="application/json", + ) + + print("cmd", cmd) + + try: + # 设置环境变量并启动进程 + env = os.environ.copy() + + with open(LOG_FILE, "w") as log: + process = subprocess.Popen(cmd, stdout=log, stderr=log, env=env, start_new_session=True) + + # 保存进程ID,port到yaml文件 + with open(PID_FILE, "w") as f: + yaml.dump({"PID": process.pid, "PORT": final_config["--port"]}, f) + + json_data = { + "status": "success", + "message": "服务启动命令已执行", + "pid": process.pid, + "config": final_config, + "log_file": LOG_FILE, + "cmd": cmd, + "port_info": { + "api_port": FD_API_PORT, + "queue_port": FD_ENGINE_QUEUE_PORT, + "metrics_port": FD_METRICS_PORT, + }, + } + + return Response(json.dumps(json_data, ensure_ascii=False), status=200, content_type="application/json") + except Exception as e: + return Response( + json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False), + status=500, + content_type="application/json", + ) + + +@app.route("/status", methods=["GET", "POST"]) +def service_status(): + """检查服务状态""" + health, msg = is_server_running() + + if not health: + return Response(json.dumps(msg, ensure_ascii=False), status=500, content_type="application/json") + + # 检查端口是否监听 + ports_status = { + "api_port": FD_API_PORT if is_port_in_use(FD_API_PORT) else None, + "queue_port": FD_ENGINE_QUEUE_PORT if is_port_in_use(FD_ENGINE_QUEUE_PORT) else None, + "metrics_port": FD_METRICS_PORT if is_port_in_use(FD_METRICS_PORT) 
else None, + } + + msg["status"] = "服务启动完成" + msg["ports_status"] = ports_status + + return Response(json.dumps(msg, ensure_ascii=False), status=200, content_type="application/json") + + +@app.route("/stop", methods=["POST"]) +def stop_service(): + """停止大模型推理服务""" + res, status_code = stop_server() + + return res, status_code + + +@app.route("/config", methods=["GET"]) +def get_config(): + """获取当前server配置""" + health, msg = is_server_running() + + if not health: + return Response(json.dumps(msg, ensure_ascii=False), status=500, content_type="application/json") + + if not os.path.exists("log/api_server.log"): + return Response( + json.dumps({"message": "api_server.log不存在"}, ensure_ascii=False), + status=500, + content_type="application/json", + ) + + try: + # 筛选出包含"args:"的行 + with open("log/api_server.log", "r") as f: + lines = [line for line in f.readlines() if "args:" in line] + + last_line = lines[-1] if lines else "" + + # 使用正则表达式提取JSON格式的配置 + match = re.search(r"args\s*[::]\s*(.*)", last_line) + if not match: + return Response( + json.dumps({"message": "api_server.log中没有args信息,请检查log"}, ensure_ascii=False), + status=500, + content_type="application/json", + ) + + # 尝试解析JSON + config_json = match.group(1).strip() + config_data = ast.literal_eval(config_json) + print("config_data", config_data, type(config_data)) + return Response( + json.dumps({"server_config": config_data}, ensure_ascii=False), status=200, content_type="application/json" + ) + + except Exception as e: + return Response( + json.dumps({"message": "api_server.log解析失败,请检查log", "error": str(e)}, ensure_ascii=False), + status=500, + content_type="application/json", + ) + + +@app.route("/wait_for_infer", methods=["POST"]) +def wait_for_infer(): + timeout = int(request.args.get("timeout", 120)) # 可选超时时间,默认120秒 + interval = 2 + response_interval = 10 + start_time = time.time() + next_response_time = start_time + + def generate(): + nonlocal next_response_time + while True: + health, msg = is_server_running() + now = time.time() + + elapsed = time.time() - start_time + + if health: + ports_status = { + "api_port": FD_API_PORT if is_port_in_use(FD_API_PORT) else None, + "queue_port": FD_ENGINE_QUEUE_PORT if is_port_in_use(FD_ENGINE_QUEUE_PORT) else None, + "metrics_port": FD_METRICS_PORT if is_port_in_use(FD_METRICS_PORT) else None, + } + msg["status"] = "服务启动完成" + msg["ports_status"] = ports_status + yield json.dumps(msg, ensure_ascii=False) + "\n" + break + + if elapsed >= timeout: + + def tail_file(path, lines=50): + try: + with open(path, "r", encoding="utf-8", errors="ignore") as f: + return "".join(f.readlines()[-lines:]) + except Exception as e: + return f"[无法读取 {path}]: {e}\n" + + result = f"服务启动超时,耗时:[{timeout}s]\n\n" + result += "==== server.log tail 50 ====\n" + result += tail_file("server.log") + result += "\n==== log/workerlog.0 tail 50 ====\n" + result += tail_file("log/workerlog.0") + + yield result + break + + if now >= next_response_time: + msg["status"] = f"服务启动中,耗时:[{int(elapsed)}s]" + yield json.dumps(msg, ensure_ascii=False) + "\n" + next_response_time += response_interval + + time.sleep(interval) + + return Response(generate(), status=200, content_type="text/plain") + + +if __name__ == "__main__": + print(f"FLASK_PORT: {FLASK_PORT}") + print(f"FD_API_PORT: {FD_API_PORT}") + print(f"FD_ENGINE_QUEUE_PORT: {FD_ENGINE_QUEUE_PORT}") + print(f"FD_METRICS_PORT: {FD_METRICS_PORT}") + app.run(host="0.0.0.0", port=FLASK_PORT, debug=False)
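Taken together, `deploy.py` is a small Flask control plane around `fastdeploy.entrypoints.openai.api_server`: `/start` and `/switch` launch or replace the server with merged CLI flags, `/status`, `/config` and `/wait_for_infer` report readiness, and `/stop` tears down the process group and frees the ports. A hedged end-to-end client session might look like the sketch below; the host, ports and model path are placeholders taken from the CI workflows, not fixed values.

```python
# Illustrative client session against the deploy.py control plane; the host, ports
# and model path are placeholders copied from the CI workflows, not fixed values.
import requests

FLASK = "http://127.0.0.1:42068"  # FLASK_PORT
API = "http://127.0.0.1:42088"    # FD_API_PORT (the "--port" value in DEFAULT_PARAMS)

# 1. Launch api_server; any CLI flag can be overridden through the JSON body.
requests.post(f"{FLASK}/start", json={"--model": "/MODELDATA/ERNIE-4.5-0.3B-Paddle"})

# 2. Stream readiness updates until the server reports healthy or times out.
with requests.post(f"{FLASK}/wait_for_infer", params={"timeout": 90}, stream=True) as r:
    for line in r.iter_lines():
        if line:
            print(line.decode("utf-8", errors="ignore"))

# 3. Inspect the running service and the arguments parsed from log/api_server.log.
print(requests.get(f"{FLASK}/status").json())
print(requests.get(f"{FLASK}/config").json())

# 4. Exercise the OpenAI-compatible endpoint, then tear the server down.
reply = requests.post(
    f"{API}/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "hi"}]},
    timeout=60,
)
print(reply.json())
requests.post(f"{FLASK}/stop")
```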