
Commit 8ef8859

Authored by: msinnha1, zhyncs, DefTruth, fzyzcjy, guoyuhong
Rebase_4_6_0_post_1 to master_next (sgl-project#31)
* fix: update pr-test-sgl-kernel (sgl-project#5399)
* kernel: support slightly faster merge_state_v2 cuda kernel (sgl-project#5381)
* chore: bump sgl-kernel 0.0.9 (sgl-project#5400)
* chore: upgrade sgl-kernel 0.0.9 (sgl-project#5401)
* Tiny fix DeepseekScalingRotaryEmbedding always use forward_native (sgl-project#5406)
* Fix bench_serving with random-ids (sgl-project#5214)
* [misc] fix ci flaky case (sgl-project#5352)
* [FIX] Fix concatenation error in capture_bs when open --disable-cuda-graph-padding and without MTP (sgl-project#5412)
* Support dynamic connection and TP 16 (sgl-project#5351) Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>
* Fix broadcast use cuda device lead to memory capacity unbalanced (sgl-project#5416)
* [PD] Fix dynamic port support and MLA buffer for Mooncake (sgl-project#5415) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com> Co-authored-by: ybyang <ybyang7@iflytek.com>
* Distinguish bootstrap key only in decode server (sgl-project#5422)
* [PD] Remove unused bootstrap param and fix port table type (sgl-project#5423)
* [minor] cleanup cmakelists.txt (sgl-project#5420)
* bugfix: fix merge_state_v2 cuda graph (sgl-project#5419)
* chore: bump sgl-kernel v0.0.9.post1 (sgl-project#5430)
* fix: solve release issue (sgl-project#5434)
* Blackwell cutlass mla: Add check for bad page size/block num combinations (sgl-project#5431)
* feat: update model_specific_adjustment (sgl-project#5344) Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
* chore: upgrade sgl-kernel 0.0.9.post1 (sgl-project#5436)
* Fix ignore_eos parameter when loading a chat template (sgl-project#5264)
* add attention backend supporting matrix in the doc (sgl-project#5211) Co-authored-by: Stefan He <hebiaobuaa@gmail.com>
* Support BNB quantization for llama/mllama (sgl-project#5038) Co-authored-by: Yuhao Yang <yyh073@foxmail.com>
* [Docs] Update start/install.md (sgl-project#5398)
* [Minor] Move torch.compile patch to a better place (sgl-project#5397)
* [Bug fix] need record start time in pd mode (sgl-project#5425)
* Support MHA with chunked prefix cache for DeepSeek chunked prefill (sgl-project#5113)
* chore: bump v0.4.5.post1 (sgl-project#5445)
* Fix several minor issues in PD disaggregation (sgl-project#5444)
* [doc] Update benchmark_and_profiling.md (sgl-project#5449)
* Update cutlass dependency. (sgl-project#5447)
* add multi-lora feature in README.md (sgl-project#5463)
* Clean up imports (sgl-project#5467)
* [verl] Modify the update_weights func to align with verl's resharding (sgl-project#5345) Co-authored-by: Chayenne <zhaochen20@outlook.com>
* [Model Support] unsloth/Phi-4-mini bnb model (sgl-project#4982) Co-authored-by: yhyang201 <yhyang201@gmail.com> Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com> Co-authored-by: Chayenne <zhaochen20@outlook.com> Co-authored-by: Yineng Zhang <me@zhyncs.com>
* Update attention_backend.md: plural form (sgl-project#5489)
* Add test for flash_attn_varlen_func kernel (sgl-project#5484)
* Deprecate disable-mla (sgl-project#5481)
* Deprecate enable-flashinfer-mla and enable-flashmla (sgl-project#5480)
* Feat/support encoder model (like bert) (sgl-project#4887)
* Enable local attention during decode (sgl-project#5479)
* Refactor DeepSeek decoder layer branches (sgl-project#5205)
* Fix a link in sgl-kernel/README.md (sgl-project#5493)
* [Bug fix] use correct func path in deepseek (sgl-project#5496) Signed-off-by: Xuchun Shang <xuchun.shang@linux.alibaba.com>
* Doc: fix problems of the 'Execute Notebooks / run-all-notebooks' ci caused by the instability of deepseek-ai/DeepSeek-R1-Distill-Qwen-7B (sgl-project#5503)
* [Feat] Update sgl-kernel flashinfer to latest main version (sgl-project#5500) Co-authored-by: zhyncs <me@zhyncs.com>
* Fix: Incorrect parameters passed to forward_batch_generation (sgl-project#5506) (sgl-project#5511)
* Fix: fix the exception 'the memory capacity is unbalanced. Some GPUs … (sgl-project#5426) Co-authored-by: ocss884 <ocss.lin@gmail.com>
* [docs] Fix several consistency issues in sampling_params.md (sgl-project#5373) Signed-off-by: windsonsea <haifeng.yao@daocloud.io> Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
* Configuration qwen2_moe.py - qkv_bias now in transformers (sgl-project#5512)
* Introduce moe_dense_tp_size to fix dense layer errors in DeepSeek V3 + 4x8xH100 (sgl-project#4836)
* Sgl kernel fused_moe_gate support n_shared_experts (sgl-project#5440)
* chore: bump sgl-kernel 0.0.9.post2 (sgl-project#5518)
* use sglang_per_token_group_quant_fp8 from sgl-kernel instead of Triton kernel (sgl-project#5473) Co-authored-by: Zhang Kaihong <zhangkaihong.zkh@alibaba-inc.com>
* fix kimi vl running bug after rebase main (sgl-project#5461)
* fix bug of VLLM_AVAILABLE not defined (sgl-project#5497)
* Avoid computing lse in Ragged Prefill when there's no prefix. (sgl-project#5476) Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
* [Model] Adding Qwen3 and Qwen3MoE (sgl-project#4693)
* fix util import (sgl-project#5542)
* Revert "Avoid computing lse in Ragged Prefill when there's no prefix.… (sgl-project#5544)
* chore: upgrade sgl-kernel 0.0.9.post2 (sgl-project#5540)
* Fix DeepGEMM masked cannot be run on groups not being multiple of 4 (sgl-project#5340)
* Make profiler output file names consistent (sgl-project#5548)
* [PD] Tiny fix timeout error when generate (sgl-project#5545)
* [PD] Fix no cache connect for receiver (sgl-project#5534)
* feat: use flashinfer jit package (sgl-project#5547)
* [PD] Remove the requirement of config file for mooncake backend (sgl-project#5460)
* restruct compressed_tensors_w8a8_fp8 (sgl-project#5475)
* simplify the control logic for using shared experts fusion (sgl-project#5504)
* Remove one kernel in per_tensor_quant_mla_fp8 (sgl-project#5549)
* Fix sampler nan check when calling top_k_top_p_sampling_from_probs (sgl-project#5546)
* [PD] Support page size > 1 (sgl-project#5561)
* fix hicache write back (sgl-project#5543)
* Minor update for ROCm variable style (sgl-project#5562)
* Fix bench_one_batch producing unnatural results for expert parallel (sgl-project#5149)
* [perf] introduce deep gemm group_gemm_masked as bmm (sgl-project#5432)
* [PD] Fix DeepSeek cannot be run on latest master (sgl-project#5568)
* Fix BumpAllocator error when no input_ids (sgl-project#5564)
* enable DeepSeek V3 shared_experts_fusion in sm90 (sgl-project#5571)
* [Fix] fix outlines and xgrammar (sgl-project#4947)
* [Doc] Add instruction for profiling with bench_one_batch (sgl-project#5581)
* Release v0.4.5.post2 (sgl-project#5582)
* Fix bench_serving fail when zero warmup requests (sgl-project#5574)
* Fix DeepEP cannot run on latest master (sgl-project#5567)
* Fix torch memory saver not enabled in DP scenario (sgl-project#5560)
* Super tiny fix typo (sgl-project#5559)
* Add document for LoRA serving (sgl-project#5521)
* Tiny improve error message (sgl-project#5526)
* [PD] Fix server crash when using batch requests (sgl-project#5531)
* [Feat] upgrade pytorch2.6 (sgl-project#5417)
* Fix enable chunked prefill for Llama4 (sgl-project#5575)
* fix: use fa3 for gemma2 (sgl-project#5586)
* Fix ChatCompletionMessageGenericParam to allow for None content (sgl-project#5452)
* [PD] Fix large page size + chunk prefill (sgl-project#5588)
* Add test config yamls for Deepseek v3 (sgl-project#5433)
* [Feature] Prefill assistant response - add continue_final_message parameter (sgl-project#4226) Co-authored-by: Chayenne <zhaochen20@outlook.com>
* add function call parser for DeepSeek V3 (sgl-project#5224)
* smaller and non gated models for docs (sgl-project#5378)
* Feat: Implement JSON Mode (response_format.type="json_object") (sgl-project#4733) Co-authored-by: Kyle Pena <kylepena@kyles-macbook-pro.turkey-marlin.ts.net>
* check marlin format before attempting conversion (sgl-project#4675)
* compressed_tensors: port w8a16 fp8 from vllm (sgl-project#4852)
* Fix one more issue reported by torchfix (sgl-project#4859)
* Add sanity check for max_running_requests (sgl-project#5016)
* Correct grafana heatmap. (sgl-project#5019)
* Perform Batch Tokenization. (sgl-project#5141)
* Speedup shared expert weight construction by avoid cloning (sgl-project#5188)
* Tiny add Engine.flush_cache API (sgl-project#5241)
* [misc] remove is_cuda_available (sgl-project#5319)
* Fix flush cache (sgl-project#5590)
* Add Speculative Decoding Eagle3 topk > 1 (sgl-project#5318) Co-authored-by: Stefan He <hebiaobuaa@gmail.com> Co-authored-by: Yubo Wang <yubowang2019@gmail.com>
* upstream hicache fixes (sgl-project#5570)
* Tiny add warning when cannot recognize bool env var (sgl-project#5348)
* Modify metrics service endpoint (sgl-project#3443)
* Update protocol.py to fix sgl-project#4589 (sgl-project#4590)
* [Feat.] Enable grafana to show metrics (sgl-project#4718) Co-authored-by: zhaochenyang20 <zhaochen20@outlook.com>
* [Fix] Enhance DP Attention for IPv6 Compatibility (sgl-project#4937)
* Support o1 model on Azure (sgl-project#4980) Co-authored-by: Shan Yu <shanyu1@g.ucla.edu>
* Tiny remove duplicated code (sgl-project#5021)
* Tiny update error hint (sgl-project#5037)
* Support PD bootstrap fields on /v1/chat/completions endpoint (sgl-project#5488)
* [PD] Fix generate endpoint of min_lb for PD (sgl-project#5598) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
* [PD] Fix edge case and simplify large page size + chunked prefill (sgl-project#5589)
* [PD] Add NIXL transfer backend (sgl-project#5477)
* [PD] Support decode overlap schedule (sgl-project#5608)
* [PD] Support prefill overlap + Ensure no race condition (sgl-project#5609)
* Enhance GPU memory settings (sgl-project#5604)
* [feature] enable pre compile jit deep_gemm (sgl-project#5580)
* Clean up mem settings (sgl-project#5610)
* Support aiter RMSNorm in AMD (sgl-project#5510) Co-authored-by: JieXin Liang <Alcanderian@users.noreply.github.com>
* chore: bump v0.4.5.post3 (sgl-project#5611)
* Remove extra copy in deepseek forward absorb (sgl-project#5578) Co-authored-by: saienduri <saimanas.enduri@amd.com>
* [Doc] Fix a 404 link to llama-405b (sgl-project#5615) Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
* [fix] force use deepgemm in compile_deep_gemm (sgl-project#5618)
* [fix] fix compile_deep_gemm missing kv_b_proj (sgl-project#5620)
* fix: gemma 3 not use softcap (sgl-project#5622)
* Fix FA3 DeepSeek prefill performance regression (sgl-project#5624) Co-authored-by: ispobock <ispobaoke@gmail.com>
* [NFC] Remove duplicate `compressed-tensors` (sgl-project#5640)
* Fix shared experts fusion error without quantization (sgl-project#5632)
* [feature] Add H20 fp8_w8a8 FusedMoE config for --n-share-experts-fusion=16 (sgl-project#5641) Co-authored-by: yuethe <yuethe@tencent.com>
* fix flashmla bug (sgl-project#5272)
* [fix] reduce dp capture bs (sgl-project#5634) Co-authored-by: alcanerian <alcanerian@gmail.com>
* Remove q concat in FA3 backend for DeepSeek decode (sgl-project#5638)
* Revert "Support aiter RMSNorm in AMD" (sgl-project#5646)
* fix: update bench_speculative (sgl-project#5649)
* Turn on DeepGemm By Default and Update Doc (sgl-project#5628)
* Fuse q_a_proj and kv_a_proj (sgl-project#5619)
* Remove unnecessary `torch.full` in DeepSeek (sgl-project#5601)
* [1/2] Add FP8 Blockscale MoE CUTLASS kernel for Blackwell (sgl-project#5281)
* fix sgl-kernel unit tests (sgl-project#5666)
* fix awq_dequantize import (sgl-project#5669)
* Integrating PD disaggregation with DP attention and DeepEP (sgl-project#5435) Co-authored-by: Byron Hsu <byronhsu1230@gmail.com>
* fix gemma3 unit test (sgl-project#5670)
* fix torchvision::nms not exist (sgl-project#5671)
* [PD] Add support for dp attention with mooncake (sgl-project#5530) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
* tune the threshold of gemma-2-27b-it in test_nightly_gsm8k_eval.py (sgl-project#5677)
* [Doc] Fix two 404 links caused by sglang typo (sgl-project#5667) Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
* fix: update truss bench_serving (sgl-project#5683)
* fix: only compile ApplyTokenBitmaskInplace cu124+ (sgl-project#5686)
* chore: bump sgl-kernel 0.1.0 (sgl-project#5688)
* vlm: enable radix cache for qwen-vl models (sgl-project#5349) Co-authored-by: Xinyuan Tong <justinning0323@outlook.com>
* [BugFix] Fix combination of MTP and `--n-share-experts-fusion` with R1 (sgl-project#5707)
* Fix weight loading bug for Deepseek v3+nextn (sgl-project#5684)
* Add example to use sgl engine with fastapi (sgl-project#5648) Co-authored-by: Ravi Theja Desetty <ravitheja@Ravis-MacBook-Pro.local>
* [Doc] Fix a link to Weilin Zhao (sgl-project#5706) Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
* Add MMMU benchmark results (sgl-project#4491) Co-authored-by: Ravi Theja Desetty <ravitheja@Ravis-MacBook-Pro.local>
* [Model] Support `ArcticForCausalLM` architecture (Snowflake/snowflake-arctic-instruct) (sgl-project#5078) Co-authored-by: vincent-4 <vincentzhongy+githubvincent4@gmail.com>
* [PD] Better logs (sgl-project#5715)
* [PD] Add kvargs table and thread pool for kvcache sender of mooncake (sgl-project#5738) Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
* [PD]: Support Multi Prefill in one node (sgl-project#5704) Co-authored-by: shuaills <shishuaiuoe@gmail.com>
* Fix: deepseek forward absorb (sgl-project#5723) Co-authored-by: ispobock <ispobaoke@163.com>
* Pin torch audio to 2.6.0 (sgl-project#5750)
* Revert "[Model] Support `ArcticForCausalLM` architecture (Snowflake/snowflake-arctic-instruct)" (sgl-project#5754)
* Disable flaky eagle tests (sgl-project#5753)
* update triton 3.2.0 h200 fused moe triton config and add warning about triton fused_moe_kernel performance degradation due to different Triton versions. (sgl-project#5740)
* [Docs] Update runtime/engine/readme.md (sgl-project#5737) Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
* Reorder loop in shared expert weight loading (sgl-project#5719)
* fix: fix one more bug from merging mm_inputs (sgl-project#5718) Co-authored-by: Xinyuan Tong <justinning0323@outlook.com> Co-authored-by: XinyuanTong <115166877+JustinTong0323@users.noreply.github.com>
* [Fix]: support deepseek-vl2-tiny model (sgl-project#5552) Co-authored-by: bppps <zouyu.zzx@alibaba-inc.com>
* Bugfix for minicpmo vision test (sgl-project#5760)
* [Minor] fix documentations (sgl-project#5756)
* Add an assertion to enhance the robustness of the operator (sgl-project#5736)
* fix: import vllm_rotary_embedding error when head_size not in 64, 128, 256, 512 (sgl-project#5733)
* Use device_id in dist init to reduce NCCL communicator warmup & creation overhead (sgl-project#5728)
* [fix] fix potential bumpy throughput with deepgemm (sgl-project#5722)
* Resolves the `404 Not Found` error when running `compile_deep_gemm.py` in multi-node setups (sgl-project#5720)
* perf: update H20 fused_moe_triton kernel config to get higher throughput during prefilling (sgl-project#5716)
* we fix the non existent access of `decrypted_config_file` (sgl-project#5685)
* CI: rewrite test_vision_chunked_prefill to speedup (sgl-project#5682)
* Fuse MLA set kv cache kernel (sgl-project#5748)
* Update amd docker image to `sglang:v0.4.5.post3-rocm630`. (sgl-project#5697)
* [feature] support for roberta embedding models (sgl-project#5730)
* [fix] fix bench_one_batch_server (sgl-project#5607)
* support for the DeepSeek model by enabling streaming response parsing (sgl-project#5592)
* fix: Use `is not None` instead of `!= None` for None checks. (sgl-project#5687)
* Add Llama 4 to FA3 test (sgl-project#5509)
* [misc] more decode step log for batch_one_batch (sgl-project#5565)
* Handle JSONDecodeError while processing request data (sgl-project#5599)
* fix(srt): check if sample_indices is not None before usage. (sgl-project#5633)
* update llguidance to 0.7.11; adds StructTag (sgl-project#4870)
* Use sgl-kernel sgl_per_token_group_quant_int8 (sgl-project#4971)
* Add memory_saver check (sgl-project#4986) Signed-off-by: Kebe <mail@kebe7jun.com>
* add switch to disable open api doc (sgl-project#3744) Signed-off-by: congcongke <zhanweidu@163.com>
* Revert "fix: import vllm_rotary_embedding error when head_size not in 64, 128, 256, 512" (sgl-project#5772)
* Fix eagle test case (sgl-project#5776)
* Split local attention test from fa3 test (sgl-project#5774)
* Revert "Revert "fix: import vllm_rotary_embedding error when head_size not in 64, 128, 256, 512"" (sgl-project#5777)
* Simplify FA3 tests (sgl-project#5779)
* Revert "[fix] fix bench_one_batch_server" (sgl-project#5785)
* Revert "Use device_id in dist init to reduce NCCL communicator warmup & creation overhead" (sgl-project#5786)
* [CI] Tune threshold (sgl-project#5787)
* [CI] fix port conflicts (sgl-project#5789)
* [CI] Fix ci tests (sgl-project#5769)
* [PD] Reduce kv transfer threads (sgl-project#5791)
* [CI] Fix test case (sgl-project#5790)
* Add 8-GPU Test for Deepseek-V3 (sgl-project#5691) Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
* Release v0.4.6 (sgl-project#5795)
* Update nightly-test.yml (sgl-project#5797)
* [CI] Improve github summary & enable fa3 for more models (sgl-project#5796)
* [Docs] update grafana setup guide in production metrics (sgl-project#5643) Co-authored-by: NoahM <88418672+zhudianGG@users.noreply.github.com>
* [Misc] add structure logging, write to file and log tracing for SGL Router
* Improve overlap scheduling (sgl-project#5788)
* Add Cutlass MLA attention backend (sgl-project#5390)
* chore: upgrade sgl-kernel 0.1.0 (sgl-project#5690)
* Dockerfile.dev pip scikit_build_core (sgl-project#5807)
* Add a doc to fix sgl-kernel build link error in py39 with ccache (sgl-project#5809)
* Turn on overlap scheduler for multimodal models (sgl-project#5771)
* Tiny refactor DefaultModelLoader.Source (sgl-project#5482)
* [Docs] Replace lists with tables for cleanup and readability in server_arguments (sgl-project#5276)
* Revert "Tiny refactor DefaultModelLoader.Source" (sgl-project#5825)
* Feat: add support for thinking mode via chat_template_kwargs.enable_t… (sgl-project#5551) Co-authored-by: shuaills <shishuaiuoe@gmail.com> Co-authored-by: Chayenne <zhaochen20@outlook.com> Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com> Co-authored-by: Yineng Zhang <me@zhyncs.com>
* fix: fix the error where the content is None when reasoning and tool … (sgl-project#5838)
* feat: Add fused moe triton config for qwen3 moe on h100 (sgl-project#5833)
* fused moe triton tuning script support qwen3 (sgl-project#5842)
* feat: Add fused moe triton config for qwen3bf16 moe on h20 (sgl-project#5839)
* [PD] support pd fake transfer for warmup (sgl-project#5726)
* [config] qwen3moe_tune_h20 fp8 tp4 (sgl-project#5846)
* [Doc] Recover history of server_arguments.md (sgl-project#5851)
* feat: Add fused moe triton config for qwen3-30b-fp8 moe on h20 (sgl-project#5850)
* [CI] test chunked prefill more (sgl-project#5798)
* ROCm: update AITER (sgl-project#5816)
* [Feat] QWen-1M context support [1/2]: Update block sparse attention backend utils kernel (sgl-project#5847) Co-authored-by: sighingnow <sighingnow@gmail.com>
* [Fix] Missing bootstrap_port field (sgl-project#5823)
* feat: update is_fa3_default_architecture (sgl-project#5854)
* add fused moe config for qwen3moe fp8/bf16 (sgl-project#5849)
* chore: bump v0.4.6.post1 (sgl-project#5845)
* fix for hpu backend in model runner and server args Signed-off-by: Mohit Sinha <msinha@habana.ai>
* rebase formatting issue Signed-off-by: Mohit Sinha <msinha@habana.ai>
* [SW-228218]: Fix device mismatch in frequency penalty. Ensure tensors in BatchedFrequencyPenalizer are on the same device by moving output_ids and frequency_penalties to the device of cumulated_frequency_penalties. This resolves a RuntimeError caused by tensors on cpu and hpu:0 during logits subtraction.

---------

Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
Signed-off-by: Xuchun Shang <xuchun.shang@linux.alibaba.com>
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
Signed-off-by: Kebe <mail@kebe7jun.com>
Signed-off-by: congcongke <zhanweidu@163.com>
Signed-off-by: Mohit Sinha <msinha@habana.ai>
Co-authored-by: Yineng Zhang <me@zhyncs.com>
Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com>
Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
Co-authored-by: Yuhong Guo <yuhong.gyh@antgroup.com>
Co-authored-by: JieXin Liang <Alcanderian@users.noreply.github.com>
Co-authored-by: Zhaoyang Hao <77828610+Muuuchen@users.noreply.github.com>
Co-authored-by: Yuan Luo <yuan.luo@hotmail.com>
Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>
Co-authored-by: lambert0312 <lambert80.ios@gmail.com>
Co-authored-by: shangmingc <caishangming@linux.alibaba.com>
Co-authored-by: ybyang <ybyang7@iflytek.com>
Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
Co-authored-by: Trevor Morris <tmorris@nvidia.com>
Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
Co-authored-by: Chang Su <chang.s.su@oracle.com>
Co-authored-by: mRSun15 <3150105645@zju.edu.cn>
Co-authored-by: ryang <38470282+ryang-max@users.noreply.github.com>
Co-authored-by: Yuhao Yang <yyh073@foxmail.com>
Co-authored-by: Michael Yao <haifeng.yao@daocloud.io>
Co-authored-by: ybyang <10629930+whybeyoung@users.noreply.github.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
Co-authored-by: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com>
Co-authored-by: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Co-authored-by: Ying Sheng <sqy1415@gmail.com>
Co-authored-by: BearBiscuit <55008898+BearBiscuit05@users.noreply.github.com>
Co-authored-by: Chayenne <zhaochen20@outlook.com>
Co-authored-by: eigen <52445717+yyihuang@users.noreply.github.com>
Co-authored-by: yhyang201 <yhyang201@gmail.com>
Co-authored-by: Didier Durand <durand.didier@gmail.com>
Co-authored-by: woodx <124784234+woodx9@users.noreply.github.com>
Co-authored-by: Xuchun Shang <xuchun.shang@linux.alibaba.com>
Co-authored-by: mlmz <54172054+minleminzui@users.noreply.github.com>
Co-authored-by: PGFLMG <1106310035@qq.com>
Co-authored-by: u4lr451 <u4lr451@gmail.com>
Co-authored-by: ocss884 <ocss.lin@gmail.com>
Co-authored-by: Michael Feil <63565275+michaelfeil@users.noreply.github.com>
Co-authored-by: strgrb <zhangkaihong.zkh@antgroup.com>
Co-authored-by: Zhang Kaihong <zhangkaihong.zkh@alibaba-inc.com>
Co-authored-by: liwenju0 <like4hub@gmail.com>
Co-authored-by: Wenxuan Tan <wtan45@wisc.edu>
Co-authored-by: yhyang201 <47235274+yhyang201@users.noreply.github.com>
Co-authored-by: Yubo Wang <yubowang2019@gmail.com>
Co-authored-by: Byron Hsu <byronhsu1230@gmail.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
Co-authored-by: Zhaoyi Li <36555117+Lzy17@users.noreply.github.com>
Co-authored-by: lukec <118525388+sleepcoo@users.noreply.github.com>
Co-authored-by: tarinkk <129432511+tarinkk@users.noreply.github.com>
Co-authored-by: AmadeusW <41280211+Amadeus-Winarto@users.noreply.github.com>
Co-authored-by: Adarsh Shirawalmath <114558126+adarshxs@users.noreply.github.com>
Co-authored-by: Yi Zhou <zhouyi920521@gmail.com>
Co-authored-by: simveit <69345428+simveit@users.noreply.github.com>
Co-authored-by: kyle-pena-kuzco <kyle.pena@kuzco.xyz>
Co-authored-by: Kyle Pena <kylepena@kyles-macbook-pro.turkey-marlin.ts.net>
Co-authored-by: Enrique Shockwave <33002121+qeternity@users.noreply.github.com>
Co-authored-by: Juwan Yoo <ryan@tmfi.us>
Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
Co-authored-by: mac0ne <mac0ne@users.noreply.github.com>
Co-authored-by: Sundara Raman Ramachandran <sundar24295@gmail.com>
Co-authored-by: Qingquan Song <ustcsqq@gmail.com>
Co-authored-by: moontidef <53668275+relic-yuexi@users.noreply.github.com>
Co-authored-by: Huapeng Zhou <73010314+PopSoda2002@users.noreply.github.com>
Co-authored-by: Lucius <souzou@foxmail.com>
Co-authored-by: Chuyue Sun <33578456+ChuyueSun@users.noreply.github.com>
Co-authored-by: Shan Yu <shanyu1@g.ucla.edu>
Co-authored-by: Yongtong Wu <914554688@qq.com>
Co-authored-by: michael-amd <Michael.Zhang@amd.com>
Co-authored-by: Ke Bao <ISPObaoke@163.com>
Co-authored-by: saienduri <saimanas.enduri@amd.com>
Co-authored-by: ispobock <ispobaoke@gmail.com>
Co-authored-by: Connector Switch <c8ef@outlook.com>
Co-authored-by: saltyfish66 <38240284+saltyfish66@users.noreply.github.com>
Co-authored-by: yuethe <yuethe@tencent.com>
Co-authored-by: alcanerian <alcanerian@gmail.com>
Co-authored-by: HAI <hixiao@gmail.com>
Co-authored-by: Mick <mickjagger19@icloud.com>
Co-authored-by: Xinyuan Tong <justinning0323@outlook.com>
Co-authored-by: Ravi Theja <ravi03071991@gmail.com>
Co-authored-by: Ravi Theja Desetty <ravitheja@Ravis-MacBook-Pro.local>
Co-authored-by: vincent-4 <vincentzhongy+githubvincent4@gmail.com>
Co-authored-by: IAN <50618241+hcyz33@users.noreply.github.com>
Co-authored-by: shuaills <shishuaiuoe@gmail.com>
Co-authored-by: XinyuanTong <115166877+JustinTong0323@users.noreply.github.com>
Co-authored-by: ZXN <44322223+bppps@users.noreply.github.com>
Co-authored-by: bppps <zouyu.zzx@alibaba-inc.com>
Co-authored-by: Yi Zhang <1109276519@qq.com>
Co-authored-by: Kyungmin Lee <30465912+lkm2835@users.noreply.github.com>
Co-authored-by: vzed <207368749+vincentzed@users.noreply.github.com>
Co-authored-by: DavidBao <121073073+DavidBao03@users.noreply.github.com>
Co-authored-by: Frankey_8080 <32973306+Frank-Jie@users.noreply.github.com>
Co-authored-by: yan97ao <580776+yan97ao@users.noreply.github.com>
Co-authored-by: aoshen524 <aoshen524@gmail.com>
Co-authored-by: Michał Moskal <michal@moskal.me>
Co-authored-by: Kebe <mail@kebe7jun.com>
Co-authored-by: zhanweidu <zhanweidu@163.com>
Co-authored-by: NoahM <88418672+zhudianGG@users.noreply.github.com>
Co-authored-by: Simo Lin <linsimo.mark@gmail.com>
Co-authored-by: JiLi <leege233@gmail.com>
Co-authored-by: sighingnow <sighingnow@gmail.com>
Co-authored-by: XTY <xutianyi1999@live.com>
Co-authored-by: vikram singh shekhawat <vshekhawat@habana.ai>
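The [SW-228218] entry above describes the fix pattern in words: move `output_ids` and `frequency_penalties` onto the device of `cumulated_frequency_penalties` before the accumulate-and-subtract step. The sketch below illustrates that pattern only; the function name and tensor shapes are assumptions for illustration, not the actual `BatchedFrequencyPenalizer` code.

```python
import torch

def apply_frequency_penalty(logits, output_ids, frequency_penalties,
                            cumulated_frequency_penalties):
    """Illustrative sketch of the device-alignment fix (hypothetical signature).

    logits / cumulated_frequency_penalties: (batch, vocab)
    output_ids: (batch, 1) token ids sampled this step
    frequency_penalties: (batch,) per-request penalty values
    """
    # The fix: align every input to the accumulator's device (e.g. hpu:0)
    # so mixed cpu/accelerator tensors cannot raise a RuntimeError below.
    device = cumulated_frequency_penalties.device
    output_ids = output_ids.to(device)
    frequency_penalties = frequency_penalties.to(device)

    # Accumulate the penalty at each newly generated token id.
    cumulated_frequency_penalties.scatter_add_(
        dim=1,
        index=output_ids,
        src=frequency_penalties.unsqueeze(1).expand_as(output_ids),
    )
    # The logits subtraction that previously failed on mismatched devices.
    return logits - cumulated_frequency_penalties
```

The key design point is to pick one canonical device (the accumulator's) and normalize everything else to it, rather than guessing which side of the transfer is cheaper per call site.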
1 parent: d10c29d · commit: 8ef8859

File tree

293 files changed: +17016 −3935 lines


.github/workflows/nightly-test.yml

Lines changed: 0 additions & 1 deletion
```diff
@@ -25,7 +25,6 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
-          pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
 
       - name: Run test
         timeout-minutes: 120
```

.github/workflows/pr-test-amd.yml

Lines changed: 7 additions & 7 deletions
```diff
@@ -38,12 +38,12 @@ jobs:
           else
             DEVICE_FLAG="--device /dev/dri"
           fi
-          docker pull lmsysorg/sglang:v0.4.5-rocm630
+          docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
           docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
             -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
             --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
             -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.5-rocm630
+            ghcr.io/saienduri/sglang-aiter-v0.1.1:428
 
       - name: Install dependencies
         run: |
@@ -82,12 +82,12 @@ jobs:
           else
             DEVICE_FLAG="--device /dev/dri"
           fi
-          docker pull lmsysorg/sglang:v0.4.5-rocm630
+          docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
           docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
             -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
             --cap-add=SYS_PTRACE -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} --security-opt seccomp=unconfined \
             -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.5-rocm630
+            ghcr.io/saienduri/sglang-aiter-v0.1.1:428
 
       - name: Install dependencies
         run: |
@@ -120,12 +120,12 @@ jobs:
           else
             DEVICE_FLAG="--device /dev/dri"
           fi
-          docker pull lmsysorg/sglang:v0.4.5-rocm630
+          docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
           docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
             -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
             --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
             -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.5-rocm630
+            ghcr.io/saienduri/sglang-aiter-v0.1.1:428
 
       - name: Install dependencies
         run: |
@@ -149,7 +149,7 @@ jobs:
   finish:
     if: always()
     needs: [
-      accuracy-test-1-gpu-amd, mla-test-1-gpu-amd
+      accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd
     ]
     runs-on: ubuntu-latest
     steps:
```

.github/workflows/pr-test-sgl-kernel.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -88,7 +88,7 @@ jobs:
       - name: Install
         run: |
           bash scripts/ci_install_dependency.sh
-          pip3 install torch==2.5.1 && pip3 install pytest
+          pip3 install torch==2.6.0 torchvision && pip3 install pytest
           pip3 uninstall sgl-kernel -y || true
           pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
           pip3 list | grep sgl-kernel
```

.github/workflows/pr-test.yml

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@ jobs:
3838
uses: actions/checkout@v4
3939

4040
- name: Install dependencies
41-
env:
42-
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer-python' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python' }}
4341
run: |
4442
bash scripts/ci_install_dependency.sh
4543
@@ -56,22 +54,20 @@ jobs:
5654
strategy:
5755
fail-fast: false
5856
matrix:
59-
part: [0, 1, 2, 3, 4, 5, 6]
57+
part: [0, 1, 2, 3, 4, 5, 6, 7]
6058
steps:
6159
- name: Checkout code
6260
uses: actions/checkout@v4
6361

6462
- name: Install dependencies
65-
env:
66-
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer-python' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python' }}
6763
run: |
6864
bash scripts/ci_install_dependency.sh
6965
7066
- name: Run test
71-
timeout-minutes: 40
67+
timeout-minutes: 30
7268
run: |
7369
cd test/srt
74-
python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
70+
python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 8
7571
7672
unit-test-backend-2-gpu:
7773
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -82,8 +78,6 @@ jobs:
8278
uses: actions/checkout@v4
8379

8480
- name: Install dependencies
85-
env:
86-
FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer-python' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python' }}
8781
run: |
8882
bash scripts/ci_install_dependency.sh
8983
@@ -93,10 +87,10 @@ jobs:
         cd test/srt
         python3 run_suite.py --suite per-commit-2-gpu

-  performance-test-1-gpu-part-1:
+  unit-test-backend-8-gpu:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
-    runs-on: 1-gpu-runner
+    runs-on: 8-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -107,11 +101,30 @@ jobs:
         run: |
           bash scripts/ci_install_dependency.sh

+      - name: Run test
+        timeout-minutes: 40
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-8-gpu
+
+  performance-test-1-gpu-part-1:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci_install_dependency.sh
+
       - name: Benchmark single latency
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1
+          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
+          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default

       - name: Benchmark online latency
         timeout-minutes: 10
@@ -146,8 +159,6 @@ jobs:
         uses: actions/checkout@v4

       - name: Install dependencies
-        env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer-python' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python' }}
         run: |
           bash scripts/ci_install_dependency.sh

@@ -178,8 +189,6 @@ jobs:
         uses: actions/checkout@v4

       - name: Install dependencies
-        env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer-python' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python' }}
         run: |
           bash scripts/ci_install_dependency.sh

@@ -216,8 +225,6 @@ jobs:
         uses: actions/checkout@v4

       - name: Install dependencies
-        env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer-python' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python' }}
         run: |
           bash scripts/ci_install_dependency.sh
           git clone https://github.com/merrymercy/human-eval.git
@@ -239,8 +246,6 @@ jobs:
         uses: actions/checkout@v4

       - name: Install dependencies
-        env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer-python' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python' }}
         run: |
           bash scripts/ci_install_dependency.sh
           git clone https://github.com/merrymercy/human-eval.git
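The workflow above shards the per-commit suite across matrix jobs with `--auto-partition-id ${{ matrix.part }} --auto-partition-size N` (raised from 7 to 8 in this commit). A minimal sketch of how such index-based partitioning can work — hypothetical, since `run_suite.py`'s real implementation is not shown in this diff:

```python
# Hypothetical sketch of CI test auto-partitioning (not run_suite.py's
# actual code). Each of the N matrix jobs takes every N-th test, so the
# shards are disjoint and together cover every test exactly once.
def auto_partition(tests, partition_id, partition_size):
    """Return the subset of tests assigned to one CI shard."""
    assert 0 <= partition_id < partition_size
    return tests[partition_id::partition_size]

tests = [f"test_{i}.py" for i in range(20)]
# Analogous to --auto-partition-id 0 --auto-partition-size 8:
shard0 = auto_partition(tests, 0, 8)
```

Growing the partition size (7 to 8) shrinks each shard, which is consistent with the per-shard timeout dropping from 40 to 30 minutes in the same hunk.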

.github/workflows/vllm-dependency-test.yml

Lines changed: 0 additions & 2 deletions
@@ -28,8 +28,6 @@ jobs:
         uses: actions/checkout@v4

       - name: Install dependencies
-        env:
-          FLASHINFER_REPO: 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python'
         run: |
           bash scripts/ci_install_dependency.sh
           pip install "vllm>=0.6.4.post1,<=0.7.2"

3rdparty/amd/tuning/benchmark_moe_rocm.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
     get_config_file_name,
 )

-padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
+padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0


 def main(model, tp_size, dtype: str, batches):
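The rename only namespaces the flag (`MOE_PADDING` to `SGLANG_MOE_PADDING`); the parsing idiom is unchanged. A standalone illustration of that `bool(int(os.getenv(...)))` pattern — the helper function here is hypothetical, not part of the benchmark script:

```python
import os

# The flag is read as an integer string: "0" -> off, anything nonzero -> on.
# An unset variable falls back to the "0" default. Note that non-integer
# values such as "true" would raise ValueError, since int() rejects them.
def moe_padding_size(env=os.environ):
    """Return the MoE padding size selected by the env flag."""
    return 128 if bool(int(env.get("SGLANG_MOE_PADDING", "0"))) else 0
```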

Makefile

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@ FILES_TO_UPDATE = docker/Dockerfile.rocm \
 		python/pyproject.toml \
 		python/sglang/version.py \
 		docs/developer/setup_github_runner.md \
-		docs/start/install.md
+		docs/start/install.md \
+		benchmark/deepseek_v3/README.md

 update: ## Update version numbers across project files. Usage: make update <new_version>
 	@if [ -z "$(filter-out $@,$(MAKECMDGOALS))" ]; then \
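The `update` target rewrites the version string in every file listed in `FILES_TO_UPDATE`; this commit adds `benchmark/deepseek_v3/README.md` to that list. A hypothetical Python sketch of the same bulk-substitution idea — file names and version values below are made up for illustration, not the repo's actual rule:

```python
import tempfile
from pathlib import Path

def bump_version(paths, old, new):
    """Replace the old version string with the new one in each file;
    return the names of files that actually changed."""
    changed = []
    for path in paths:
        text = path.read_text()
        updated = text.replace(old, new)
        if updated != text:
            path.write_text(updated)
            changed.append(path.name)
    return changed

# Demo on a throwaway file standing in for an entry of FILES_TO_UPDATE.
demo = Path(tempfile.mkdtemp()) / "version.py"
demo.write_text('__version__ = "0.4.5"\n')
changed = bump_version([demo], "0.4.5", "0.4.6")
```

Returning only the files that changed mirrors why the list must be kept current: a file missing from `FILES_TO_UPDATE` is silently left on the old version.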

README.md

Lines changed: 3 additions & 3 deletions
@@ -43,7 +43,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:

-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -71,5 +71,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor

 For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.

-## Acknowledgment and Citation
-We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+## Acknowledgment
+We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
