[router] add tool parser base structure and partial json parser (#9482) #8
name: PR Test (PD Router)

on:
  push:
    branches: [ main ]
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  pull_request:
    branches: [ main ]
    paths:
      - 'python/sglang/srt/disaggregation/**'
      - 'scripts/ci/ci_start_disaggregation_servers.sh'
      - 'sgl-router/**'
  workflow_dispatch:

concurrency:
  group: test-disaggregation-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  pull-requests: write
  issues: write

jobs:
  test-disaggregation:
    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
      github.event.pull_request.draft == false
    runs-on: [h200]
    timeout-minutes: 45
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 10
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.12'
      - name: Setup Rust
        run: |
          bash scripts/ci/ci_install_rust.sh
      - name: Cache Rust dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/bin/
            ~/.cargo/registry/index/
            ~/.cargo/registry/cache/
            ~/.cargo/git/db/
            sgl-router/target/
          key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
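          # The key pins the cache to the exact Cargo.lock contents; when it
          # misses, restore-keys below falls back to the newest cargo cache
          # for this OS so incremental builds stay warm.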
          restore-keys: |
            ${{ runner.os }}-cargo-
      - name: Cache pip dependencies
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('python/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-pip-
      - name: Validate environment
        run: |
          echo "=== System Validation ==="
          nvidia-smi
          echo "GPU count: $(nvidia-smi -L | wc -l)"
          if [ "$(nvidia-smi -L | wc -l)" -lt 8 ]; then
            echo "Error: This test requires at least 8 GPUs"
            exit 1
          fi
          echo "=== RDMA Validation ==="
          if ! command -v ibv_devices >/dev/null 2>&1; then
            echo "Error: InfiniBand tools not found"
            exit 1
          fi
          # Check for active IB devices
          found_active_device=false
          for device in mlx5_{0..11}; do
            if ibv_devinfo "$device" >/dev/null 2>&1; then
              state=$(ibv_devinfo "$device" | grep "state:" | head -1 | awk '{print $2}')
              if [[ "$state" == "PORT_ACTIVE" ]]; then
                echo "✓ Found active device: $device"
                found_active_device=true
                break
              fi
            fi
          done
          if [ "$found_active_device" = false ]; then
            echo "Error: No active IB devices found"
            echo "Available devices:"
            ibv_devices || true
            exit 1
          fi
          echo "=== Model Validation ==="
          if [ ! -d "/raid/models/meta-llama/Llama-3.1-8B-Instruct" ]; then
            echo "Error: Model not found"
            ls -la /raid/models/ || echo "No models directory"
            exit 1
          fi
          echo "✓ Model found"
      - name: Install SGLang dependencies
        run: |
          echo "Installing SGLang with all extras..."
          python3 -m pip --no-cache-dir install --upgrade pip
          python3 -m pip --no-cache-dir install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
          python3 -m pip --no-cache-dir install -e "python[all]" --break-system-packages
          python3 -m pip --no-cache-dir install mooncake-transfer-engine==0.3.5
          python3 -m pip --no-cache-dir install --user --force-reinstall genai-bench==0.0.1
          python3 -m pip --no-cache-dir install sgl-kernel==0.3.5
      - name: Build and install sgl-router
        run: |
          source "$HOME/.cargo/env"
          echo "Building sgl-router..."
          cd sgl-router
          cargo build && python3 -m build && pip install --force-reinstall dist/*.whl
      - name: Start disaggregation servers
        id: start_servers
        run: |
          echo "Starting disaggregation servers..."
          bash scripts/ci/ci_start_disaggregation_servers.sh &
          SERVER_PID=$!
          echo "server_pid=$SERVER_PID" >> $GITHUB_OUTPUT
          # Wait for the startup script to finish its own health checks for all 8 servers
          wait_count=0
          while [ $wait_count -lt 30 ]; do
            if ps -p $SERVER_PID > /dev/null; then
              # Startup script still running; keep polling
              sleep 2
              wait_count=$((wait_count + 1))
            else
              # Script exited - check whether it succeeded
              wait $SERVER_PID
              exit_code=$?
              if [ $exit_code -eq 0 ]; then
                echo "✓ All disaggregation servers are healthy"
                break
              else
                echo "Error: Server startup failed with code $exit_code"
                exit 1
              fi
            fi
          done
          echo "✓ Servers started (PID: $SERVER_PID)"
      - name: Test all policies sequentially
        timeout-minutes: 30
        run: |
          POLICIES=("random" "round_robin" "cache_aware" "power_of_two")
          BASE_URL="http://127.0.0.9:8000"
          for policy in "${POLICIES[@]}"; do
            echo ""
            echo "=================================================="
            echo "Testing policy: $policy"
            echo "=================================================="
            # Start router with the current policy
            echo "Starting router with policy: $policy..."
            python3 -m sglang_router.launch_router \
              --pd-disaggregation \
              --policy "$policy" \
              --prefill http://127.0.0.1:30001 9001 \
              --prefill http://127.0.0.2:30002 9002 \
              --prefill http://127.0.0.3:30003 9003 \
              --prefill http://127.0.0.4:30004 9004 \
              --decode http://127.0.0.5:30005 \
              --decode http://127.0.0.6:30006 \
              --decode http://127.0.0.7:30007 \
              --decode http://127.0.0.8:30008 \
              --host 127.0.0.9 \
              --port 8000 &
            ROUTER_PID=$!
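            # Topology note: each --prefill flag takes the worker URL plus what
            # appears to be its bootstrap port (9001-9004); --decode workers take
            # only a URL. The router itself listens on 127.0.0.9:8000 ($BASE_URL).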
            # Wait for router to become healthy
            echo "Waiting for router to become healthy..."
            TIMEOUT=60
            ELAPSED=0
            while [ $ELAPSED -lt $TIMEOUT ]; do
              if curl --connect-timeout 5 --silent "$BASE_URL" > /dev/null 2>&1; then
                echo "✓ Router is reachable"
                break
              fi
              if ! ps -p $ROUTER_PID > /dev/null; then
                echo "Error: Router process died"
                exit 1
              fi
              sleep 5
              ELAPSED=$((ELAPSED + 5))
            done
            if [ $ELAPSED -ge $TIMEOUT ]; then
              echo "Error: Router health check timeout"
              kill $ROUTER_PID 2>/dev/null || true
              exit 1
            fi
            # Test API functionality
            echo "Testing API completions for $policy..."
            response=$(curl -s -X POST "$BASE_URL/v1/chat/completions" \
              -H "Content-Type: application/json" \
              -H "Authorization: Bearer test-token" \
              -d '{
                "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
                "messages": [
                  {"role": "user", "content": "Write a Python function to calculate fibonacci numbers recursively"}
                ],
                "stream": false,
                "max_tokens": 100
              }')
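            # A successful OpenAI-style response should look roughly like:
            #   {"choices": [{"message": {"role": "assistant", "content": "..."}}], ...}
            # The jq check below only asserts that .choices[0].message.content exists.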
if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then | |
echo "✓ API test passed for $policy" | |
else | |
echo "✗ API test failed for $policy: $response" | |
kill $ROUTER_PID 2>/dev/null || true | |
exit 1 | |
fi | |
# Test streaming | |
echo "Testing streaming API for $policy..." | |
stream_response=$(timeout 30 curl -s -X POST "$BASE_URL/v1/chat/completions" \ | |
-H "Content-Type: application/json" \ | |
-H "Authorization: Bearer test-token" \ | |
-d '{ | |
"model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct", | |
"messages": [ | |
{"role": "user", "content": "Count from 1 to 5"} | |
], | |
"stream": true, | |
"max_tokens": 50 | |
}') | |
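            # Streamed replies arrive as Server-Sent Events, one chunk per line, e.g.:
            #   data: {"choices":[{"delta":{"content":"1"}}], ...}
            #   data: [DONE]
            # so grepping for "data:" is enough to confirm streaming works end to end.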
if echo "$stream_response" | grep -q "data:"; then | |
echo "✓ Streaming API test passed for $policy" | |
else | |
echo "✗ Streaming API test failed for $policy" | |
kill $ROUTER_PID 2>/dev/null || true | |
exit 1 | |
fi | |
# Run genai-bench benchmark | |
echo "Running genai-bench for $policy..." | |
genai-bench benchmark \ | |
--api-backend openai \ | |
--api-base "http://127.0.0.9:8000" \ | |
--api-key "dummy-token" \ | |
--api-model-name "/raid/models/meta-llama/Llama-3.1-8B-Instruct" \ | |
--model-tokenizer /raid/models/meta-llama/Llama-3.1-8B-Instruct \ | |
--task text-to-text \ | |
--num-concurrency 64 \ | |
--traffic-scenario "D(8000,2000)" \ | |
--max-requests-per-run 640 \ | |
--max-time-per-run 2 \ | |
--experiment-folder-name "benchmark_${policy}" \ | |
--experiment-base-dir "." | |
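            # Scenario note: "D(8000,2000)" is assumed to be genai-bench's
            # deterministic traffic scenario (8000 input / 2000 output tokens per
            # request); consult the genai-bench docs before tuning these numbers.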
            # Find the actual experiment folder
            actual_folder=$(find . -maxdepth 1 -name "benchmark_${policy}" -type d | head -1)
            if [ -n "$actual_folder" ]; then
              echo "Genai-bench results saved in: $actual_folder"
              # Extract mean values and validate performance thresholds
              echo "📊 Extracting performance metrics for $policy..."
              # Find JSON files excluding experiment metadata
              json_files=$(find "$actual_folder" -name "*.json" | grep -v experiment_metadata)
              if [ -n "$json_files" ]; then
                # Extract metrics using jq and validate against loose thresholds
                for json_file in $json_files; do
                  echo "Processing: $(basename "$json_file")"
                  # Extract mean values for performance validation
                  ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean' "$json_file")
                  e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean' "$json_file")
                  input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean' "$json_file")
                  output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean' "$json_file")
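                  # The jq paths above assume a results JSON shaped roughly like:
                  #   {"aggregated_metrics": {"stats": {
                  #     "ttft": {"mean": ...}, "e2e_latency": {"mean": ...},
                  #     "input_throughput": {"mean": ...}, "output_throughput": {"mean": ...}}}}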
echo " TTFT mean: ${ttft_mean}s" | |
echo " E2E Latency mean: ${e2e_latency_mean}s" | |
echo " Input Throughput mean: ${input_throughput_mean} tokens/s" | |
echo " Output Throughput mean: ${output_throughput_mean} tokens/s" | |
# Set mean thresholds (allowing for reasonable variance) | |
# These can be adjusted based on your performance requirements | |
ttft_threshold=2.0 # Max 2.0 seconds for mean TTFT | |
e2e_latency_threshold=24.0 # Max 8.0 seconds for mean E2E latency | |
input_throughput_threshold=10000 # Min 9000 tokens/s for mean input throughput | |
output_throughput_threshold=90 # Min 100 tokens/s for mean output throughput | |
# Validate mean thresholds | |
validation_passed=true | |
if (( $(echo "$ttft_mean > $ttft_threshold" | bc -l) )); then | |
echo "❌ TTFT validation failed: $ttft_mean > $ttft_threshold" | |
validation_passed=false | |
fi | |
if (( $(echo "$e2e_latency_mean > $e2e_latency_threshold" | bc -l) )); then | |
echo "❌ E2E Latency validation failed: $e2e_latency_mean > $e2e_latency_threshold" | |
validation_passed=false | |
fi | |
if (( $(echo "$input_throughput_mean < $input_throughput_threshold" | bc -l) )); then | |
echo "❌ Input Throughput validation failed: $input_throughput_mean < $input_throughput_threshold" | |
validation_passed=false | |
fi | |
if (( $(echo "$output_throughput_mean < $output_throughput_threshold" | bc -l) )); then | |
echo "❌ Output Throughput validation failed: $output_throughput_mean < $output_throughput_threshold" | |
validation_passed=false | |
fi | |
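                  # bc -l prints 1 when a comparison holds and 0 otherwise, so the
                  # arithmetic test (( ... )) succeeds exactly when a threshold is violated.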
if [ "$validation_passed" = true ]; then | |
echo "✅ Performance validation passed for $policy" | |
else | |
echo "❌ Performance validation failed for $policy" | |
kill $ROUTER_PID 2>/dev/null || true | |
exit 1 | |
fi | |
done | |
echo "✓ Genai-bench completed successfully for $policy" | |
echo "📊 Detailed metrics and plots available in: $actual_folder" | |
else | |
echo "✗ Benchmark failed for $policy: No JSON results found" | |
kill $ROUTER_PID 2>/dev/null || true | |
exit 1 | |
fi | |
else | |
echo "✗ Benchmark failed for $policy: Experiment folder not found" | |
kill $ROUTER_PID 2>/dev/null || true | |
exit 1 | |
fi | |
# Stop router before testing next policy | |
echo "Stopping router for $policy..." | |
# First try graceful shutdown | |
kill $ROUTER_PID 2>/dev/null || true | |
# Wait up to 5 seconds for graceful shutdown | |
for i in {1..5}; do | |
if ! ps -p $ROUTER_PID > /dev/null 2>&1; then | |
echo "Router stopped gracefully" | |
break | |
fi | |
sleep 1 | |
done | |
# Force kill if still running | |
if ps -p $ROUTER_PID > /dev/null 2>&1; then | |
echo "Force killing router..." | |
kill -9 $ROUTER_PID 2>/dev/null || true | |
fi | |
# Short delay to ensure port is released | |
sleep 2 | |
echo "✓ Completed testing for $policy" | |
done | |
echo "" | |
echo "✅ All policies tested successfully!" | |
      - name: Upload benchmark results
        if: success()
        uses: actions/upload-artifact@v4
        with:
          name: genai-bench-results-all-policies
          path: benchmark_**/
      - name: Cleanup servers
        if: always()
        run: |
          if [ -n "${{ steps.start_servers.outputs.server_pid }}" ]; then
            pkill -P ${{ steps.start_servers.outputs.server_pid }} || true
            kill ${{ steps.start_servers.outputs.server_pid }} || true
          fi
          pkill -f "sglang.launch_server" || true
          sleep 5
          # Exclude the grep process itself so an empty result really counts as 0
          remaining=$(ps aux | grep "sglang.launch_server" | grep -v grep | wc -l)
          echo "Cleanup completed. Remaining processes: $remaining"
  summarize-benchmarks:
    needs: test-disaggregation
    runs-on: ubuntu-latest
    if: success()
    steps:
      - name: Install jq
        run: sudo apt-get update && sudo apt-get install -y jq bc
      - name: Download benchmark results
        uses: actions/download-artifact@v4
        with:
          name: genai-bench-results-all-policies
      - name: List downloaded contents
        run: |
          echo "Contents after download:"
          ls -la
          find . -name "benchmark_*" -type d
          echo "JSON files found:"
          find . -name "*.json" | head -10
      - name: Create benchmark summary
        run: |
          echo "=== DEBUG: Creating benchmark summary ==="
          echo "Available benchmark directories:"
          find . -name "benchmark_*" -type d
          echo "=========================================="
          echo "## PD Router Genai-Bench Results Summary" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "🚀 **Benchmarked with genai-bench for comprehensive LLM serving performance evaluation**" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "| Policy | Status | TTFT (s) | E2E Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |" >> $GITHUB_STEP_SUMMARY
          echo "|--------|--------|----------|-----------------|--------------------------|---------------------------|" >> $GITHUB_STEP_SUMMARY
          # First, complete the table with all policies
          for policy in random round_robin cache_aware power_of_two; do
            # Find genai-bench result folders for this policy (handle zip extraction structure)
            result_folder=$(find . -maxdepth 2 -name "benchmark_${policy}" -type d | head -1)
            if [ -z "$result_folder" ]; then
              # Try alternative patterns in case of different extraction structure
              result_folder=$(find . -maxdepth 3 -path "*benchmark_${policy}*" -type d | head -1)
            fi
            echo "DEBUG: Policy ${policy} -> Found folder: ${result_folder:-'NOT FOUND'}"
            if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then
              # Find JSON file with metrics
              json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)
              if [ -n "$json_file" ] && [ -f "$json_file" ]; then
                # Extract performance metrics
                ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
                e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
                input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
                output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
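                # jq's // operator substitutes "N/A" when the path is missing or null,
                # and the || echo fallback covers jq failing outright (e.g. malformed JSON).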
                # Format numbers for display (2 decimal places)
                if [ "$ttft_mean" != "N/A" ] && [ "$ttft_mean" != "null" ]; then
                  ttft_display=$(printf "%.2f" "$ttft_mean" 2>/dev/null || echo "$ttft_mean")
                else
                  ttft_display="N/A"
                fi
                if [ "$e2e_latency_mean" != "N/A" ] && [ "$e2e_latency_mean" != "null" ]; then
                  e2e_display=$(printf "%.2f" "$e2e_latency_mean" 2>/dev/null || echo "$e2e_latency_mean")
                else
                  e2e_display="N/A"
                fi
                if [ "$input_throughput_mean" != "N/A" ] && [ "$input_throughput_mean" != "null" ]; then
                  input_display=$(printf "%.0f" "$input_throughput_mean" 2>/dev/null || echo "$input_throughput_mean")
                else
                  input_display="N/A"
                fi
                if [ "$output_throughput_mean" != "N/A" ] && [ "$output_throughput_mean" != "null" ]; then
                  output_display=$(printf "%.0f" "$output_throughput_mean" 2>/dev/null || echo "$output_throughput_mean")
                else
                  output_display="N/A"
                fi
                echo "| ${policy} | ✅ Success | $ttft_display | $e2e_display | $input_display | $output_display |" >> $GITHUB_STEP_SUMMARY
              else
                echo "| ${policy} | ❌ No Data | N/A | N/A | N/A | N/A |" >> $GITHUB_STEP_SUMMARY
              fi
            else
              echo "| ${policy} | ❌ Failed | N/A | N/A | N/A | N/A |" >> $GITHUB_STEP_SUMMARY
            fi
          done
          # Add performance validation summary
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "## 📊 Performance Validation" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
echo "**Thresholds:** TTFT ≤ 2.0s | E2E Latency ≤ 8.0s | Input Throughput ≥ 10,000 tok/s | Output Throughput ≥ 100 tok/s" >> $GITHUB_STEP_SUMMARY | |
echo "" >> $GITHUB_STEP_SUMMARY | |
validation_summary="" | |
for policy in random round_robin cache_aware power_of_two; do | |
# Use same robust path finding as above | |
result_folder=$(find . -maxdepth 2 -name "benchmark_${policy}" -type d | head -1) | |
if [ -z "$result_folder" ]; then | |
result_folder=$(find . -maxdepth 3 -path "*benchmark_${policy}*" -type d | head -1) | |
fi | |
if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then | |
json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1) | |
if [ -n "$json_file" ] && [ -f "$json_file" ]; then | |
# Extract metrics for validation | |
ttft=$(jq -r '.aggregated_metrics.stats.ttft.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A") | |
e2e_latency=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A") | |
input_throughput=$(jq -r '.aggregated_metrics.stats.input_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A") | |
output_throughput=$(jq -r '.aggregated_metrics.stats.output_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A") | |
# Check thresholds (using same values as in main workflow) | |
validation_status="✅" | |
if [ "$ttft" != "N/A" ] && [ "$ttft" != "null" ]; then | |
if (( $(echo "$ttft > 2.0" | bc -l 2>/dev/null || echo "0") )); then | |
validation_status="❌" | |
fi | |
fi | |
if [ "$e2e_latency" != "N/A" ] && [ "$e2e_latency" != "null" ]; then | |
if (( $(echo "$e2e_latency > 24.0" | bc -l 2>/dev/null || echo "0") )); then | |
validation_status="❌" | |
fi | |
fi | |
if [ "$input_throughput" != "N/A" ] && [ "$input_throughput" != "null" ]; then | |
if (( $(echo "$input_throughput < 10000" | bc -l 2>/dev/null || echo "0") )); then | |
validation_status="❌" | |
fi | |
fi | |
if [ "$output_throughput" != "N/A" ] && [ "$output_throughput" != "null" ]; then | |
if (( $(echo "$output_throughput < 90" | bc -l 2>/dev/null || echo "0") )); then | |
validation_status="❌" | |
fi | |
fi | |
validation_summary="${validation_summary}- **${policy}**: $validation_status\n" | |
else | |
validation_summary="${validation_summary}- **${policy}**: ❌ No data\n" | |
fi | |
else | |
validation_summary="${validation_summary}- **${policy}**: ❌ Failed\n" | |
fi | |
done | |
echo -e "$validation_summary" >> $GITHUB_STEP_SUMMARY | |
echo "" >> $GITHUB_STEP_SUMMARY | |
echo "## 📊 Genai-Bench Features Used" >> $GITHUB_STEP_SUMMARY | |
echo "- **Token-level Performance**: TTFT, TPOT, End-to-End latency" >> $GITHUB_STEP_SUMMARY | |
echo "- **Throughput Analysis**: Input/Output/Total token throughput" >> $GITHUB_STEP_SUMMARY | |
echo "- **Statistical Analysis**: Percentiles, mean, std dev for all metrics" >> $GITHUB_STEP_SUMMARY | |
echo "- **Visual Reports**: Automated plots and Excel summaries" >> $GITHUB_STEP_SUMMARY | |
echo "- **SGLang Backend**: Native integration with SGLang serving" >> $GITHUB_STEP_SUMMARY | |
echo "" >> $GITHUB_STEP_SUMMARY | |
echo "✅ All policies tested successfully with genai-bench!" >> $GITHUB_STEP_SUMMARY |