【TEST】add some func test ci #3412

Open
wants to merge 5 commits into develop

38 changes: 38 additions & 0 deletions test/ci_use/QwQ-32B/run_server.sh
@@ -0,0 +1,38 @@
#!/bin/bash
MODEL_PATH=${1}

if [ -z "$MODEL_PATH" ]; then
    echo "❌ Usage: $0 <model_path>"
    exit 1
fi

if [ ! -d "$MODEL_PATH" ]; then
    echo "❌ Error: model directory does not exist: $MODEL_PATH"
    exit 1
fi

echo "📁 Using model: $MODEL_PATH"


# Clean up logs
rm -rf log/*
mkdir -p log

# Environment variables
export CUDA_VISIBLE_DEVICES=0,1
export INFERENCE_MSG_QUEUE_ID=7679
export ENABLE_V1_KVCACHE_SCHEDULER=1


python -m fastdeploy.entrypoints.openai.api_server \
    --tensor-parallel-size 2 \
    --port 8787 \
    --engine-worker-queue-port 7679 \
    --metrics-port 7877 \
    --quantization wint8 \
    --max-model-len 32768 \
    --max-num-seqs 256 \
    --gpu-memory-utilization 0.9 \
    --model "$MODEL_PATH" \
    --load-strategy ipc_snapshot \
    --dynamic-load-weight
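
Note: run_server.sh launches the API server in the foreground and does not signal readiness, so whatever CI job wires these scripts together needs its own wait step before starting run_test.sh. A minimal sketch of such a wrapper is shown below; the model path is a placeholder, and the /health probe is an assumption (any lightweight GET endpoint on port 8787 could be polled the same way).

    #!/bin/bash
    # Hypothetical CI wrapper -- illustrative only, not part of this PR.
    MODEL_PATH=/path/to/QwQ-32B   # placeholder path

    bash run_server.sh "$MODEL_PATH" > server.log 2>&1 &
    SERVER_PID=$!

    # Wait for the server to come up (assumes a /health endpoint on port 8787).
    for _ in $(seq 1 120); do
        code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:8787/health")
        [ "$code" = "200" ] && break
        sleep 5
    done

    bash run_test.sh
    status=$?
    kill "$SERVER_PID"
    exit "$status"
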
155 changes: 155 additions & 0 deletions test/ci_use/QwQ-32B/run_test.sh
@@ -0,0 +1,155 @@
#!/bin/bash

# ================== Configuration Parameters ==================

HOST="0.0.0.0"
PORT="8787"
BASE_URL="http://$HOST:$PORT"

TOTAL_ROUNDS=100
CHAT_REQUESTS_PER_ROUND=5
export CUDA_VISIBLE_DEVICES=0,1
MAX_MEMORY_MB=10240 # 10GB

# ====================================================
# assert_eq actual expected message
assert_eq() {
    local actual="$1"
    local expected="$2"
    local msg="$3"
    if [ "$actual" != "$expected" ]; then
        echo "Assertion failed: $msg" >&2
        exit 1
    fi
}

# assert_true condition message
assert_true() {
    local condition="$1"
    local msg="$2"
    if [ "$condition" != "1" ] && [ "$condition" != "true" ]; then
        echo "Assertion failed: $msg" >&2
        exit 1
    fi
}

# assert_success exit_code message
assert_success() {
    local code="$1"
    local msg="$2"
    if [ "$code" -ne 0 ]; then
        echo "Assertion failed: $msg" >&2
        exit 1
    fi
}

# curl_get_status(url, options...) → returns via global variables http_code and response_body
curl_get_status() {
    local result
    result=$(curl -s -w "%{http_code}" "$@")
    http_code="${result: -3}"
    response_body="${result%???}"
}
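# Example usage (illustrative endpoint, not one this script calls):
#   curl_get_status "$BASE_URL/v1/models"
#   echo "HTTP $http_code, body: $response_body"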

# ====================================================
# Get visible GPU IDs from CUDA_VISIBLE_DEVICES
# ====================================================

get_visible_gpu_ids() {
    local ids=()
    IFS=',' read -ra ADDR <<< "$CUDA_VISIBLE_DEVICES"
    for i in "${ADDR[@]}"; do
        if [[ "$i" =~ ^[0-9]+$ ]]; then
            ids+=("$i")
        fi
    done
    echo "${ids[@]}"
}
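# Example: with CUDA_VISIBLE_DEVICES="0,1" this prints "0 1"; non-numeric entries
# (e.g. GPU UUIDs or an empty string) are silently skipped.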

# ====================================================
# Check GPU memory usage (must not exceed MAX_MEMORY_MB)
# ====================================================

check_gpu_memory() {
    local gpu_ids
    gpu_ids=($(get_visible_gpu_ids))

    if [ ${#gpu_ids[@]} -eq 0 ]; then
        echo "Assertion failed: No valid GPU IDs in CUDA_VISIBLE_DEVICES='$CUDA_VISIBLE_DEVICES'" >&2
        exit 1
    fi

    for gpu_id in "${gpu_ids[@]}"; do
        local memory_used
        memory_used=$(nvidia-smi -i "$gpu_id" --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null) || \
            assert_success $? "Failed to query GPU $gpu_id memory usage"

        if ! [[ "$memory_used" =~ ^[0-9]+ ]]; then
            echo "Assertion failed: Invalid memory value for GPU $gpu_id: $memory_used" >&2
            exit 1
        fi

        assert_true "$(( memory_used <= MAX_MEMORY_MB ))" \
            "GPU $gpu_id memory $memory_used MB > $MAX_MEMORY_MB MB"
    done
}
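# Note: with --format=csv,noheader,nounits, nvidia-smi reports memory.used as a bare
# integer in MiB, so MAX_MEMORY_MB=10240 allows roughly 10 GiB of residual usage per visible GPU.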

# ====================================================

for round in $(seq 1 $TOTAL_ROUNDS); do
    echo "=== Round $round / $TOTAL_ROUNDS ==="

    # Step 1: Clear loaded weights
    echo "[Step 1] Clearing load weight..."
    curl_get_status -i "$BASE_URL/clear_load_weight"
    assert_eq "$http_code" "200" "/clear_load_weight failed with HTTP $http_code"

    # Step 2: Check GPU memory usage
    echo "[Step 2] Checking GPU memory..."
    check_gpu_memory

    # Step 3: Update model weights
    echo "[Step 3] Updating model weight..."
    curl_get_status -i "$BASE_URL/update_model_weight"
    assert_eq "$http_code" "200" "/update_model_weight failed with HTTP $http_code"

    # Step 4: Send chat completion requests
    echo "[Step 4] Sending $CHAT_REQUESTS_PER_ROUND chat completions..."
    for i in $(seq 1 $CHAT_REQUESTS_PER_ROUND); do
        echo " Request $i / $CHAT_REQUESTS_PER_ROUND"
        # Send request and capture response
        response=$(curl -s -X POST "$BASE_URL/v1/chat/completions" \
            -H "Content-Type: application/json" \
            -d '{"messages": [{"role": "user", "content": "Hello!"}]}')

        # Extract the 'content' field from the response
        content=$(echo "$response" | \
            grep -o '"content":"[^"]*"' | \
            head -1 | \
            sed 's/^"content":"//' | \
            sed 's/"$//')

        if [ -z "$content" ]; then
            # Fallback: try extracting content using sed more robustly
            content=$(echo "$response" | \
                sed -n 's/.*"content":"\([^"]*\)".*/\1/p' | \
                head -1)
        fi
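        # Note: if jq is available on the CI image (not something this script assumes),
        # the grep/sed extraction above could be replaced with a single, more robust call:
        #   content=$(echo "$response" | jq -r '.choices[0].message.content // empty')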

        # Check if content is empty or null
        if [ -z "$content" ] || [ "$content" = "null" ]; then
            echo "❌ Failed: Empty or null 'content' in response" >&2
            echo "Raw response:" >&2
            echo "$response" >&2
            exit 1
        fi

        echo "✅ Received non-empty response"
        echo -e "\n---\n"
    done

echo "Round $round completed."
echo "==================================\n"
done

echo "✅ All $TOTAL_ROUNDS rounds completed successfully."