Skip to content

Commit ce6119c

Browse files
zhyncsxwu-intel
authored and committed
chore: upgrade sgl-kernel 0.1.1 (sgl-project#5933)
1 parent 5691b95 commit ce6119c

File tree

7 files changed

+20
-14
lines changed

7 files changed

+20
-14
lines changed

.github/workflows/vllm-dependency-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ jobs:
3030
- name: Install dependencies
3131
run: |
3232
bash scripts/ci_install_dependency.sh
33-
pip install "vllm>=0.6.4.post1,<=0.7.2"
33+
pip install "vllm==0.8.4"
3434
pip install "bitsandbytes>=0.44.0"
3535
3636
- name: Run VLLM dependency tests

python/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ runtime_common = [
4747

4848
srt = [
4949
"sglang[runtime_common]",
50-
"sgl-kernel==0.1.0",
50+
"sgl-kernel==0.1.1",
5151
"flashinfer_python==0.2.5",
5252
"torch==2.6.0",
5353
"torchvision==0.21.0",

python/sglang/srt/entrypoints/engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ def _set_envs_and_config(server_args: ServerArgs):
461461
if _is_cuda:
462462
assert_pkg_version(
463463
"sgl-kernel",
464-
"0.1.0",
464+
"0.1.1",
465465
"Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
466466
)
467467

python/sglang/srt/layers/quantization/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
109109
if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
110110
raise ValueError(
111111
f"{quantization} quantization requires some operators from vllm. "
112-
"Pleaes install vllm by `pip install vllm==0.7.2`"
112+
"Pleaes install vllm by `pip install vllm==0.8.4`"
113113
)
114114

115115
return QUANTIZATION_METHODS[quantization]
@@ -310,7 +310,7 @@ def new_apply(
310310
if correction_bias is not None:
311311
if not has_correction_bias:
312312
raise ValueError(
313-
"Please increase the version of your vllm. Try `pip install vllm==0.7.2`"
313+
"Please increase the version of your vllm. Try `pip install vllm==0.8.4`"
314314
)
315315
kwargs["e_score_correction_bias"] = correction_bias
316316
return original_apply(**kwargs)

python/sglang/srt/model_executor/model_runner.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
get_available_gpu_memory,
8080
get_bool_env_var,
8181
init_custom_process_group,
82+
is_ampere_with_cuda_12_3,
8283
is_cuda,
8384
is_fa3_default_architecture,
8485
is_flashinfer_available,
@@ -246,7 +247,7 @@ def model_specific_adjustment(self):
246247
if not self.use_mla_backend:
247248
# MHA architecture
248249
if (
249-
is_hopper_with_cuda_12_3()
250+
(is_ampere_with_cuda_12_3() or is_hopper_with_cuda_12_3())
250251
and is_no_spec_infer_or_topk_one(server_args)
251252
and is_fa3_default_architecture(self.model_config.hf_config)
252253
):
@@ -927,8 +928,10 @@ def init_attention_backend(self):
927928

928929
self.attn_backend = FlashMLABackend(self)
929930
elif self.server_args.attention_backend == "fa3":
930-
assert torch.cuda.get_device_capability()[0] >= 9, (
931-
"FlashAttention v3 Backend requires SM>=90. "
931+
assert (
932+
torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
933+
) or torch.cuda.get_device_capability()[0] == 9, (
934+
"FlashAttention v3 Backend requires SM>=80 and SM<=90. "
932935
"Please use `--attention-backend flashinfer`."
933936
)
934937
from sglang.srt.layers.attention.flashattention_backend import (

python/sglang/srt/utils.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1905,13 +1905,16 @@ def fast_topk(values, topk, dim):
19051905
return torch.topk(values, topk, dim=dim)
19061906

19071907

1908-
def is_hopper_with_cuda_12_3():
1908+
def _check(cc_major):
19091909
if not is_cuda():
19101910
return False
1911-
is_hopper = torch.cuda.get_device_capability()[0] == 9
1912-
cuda_version = torch.version.cuda.split(".")
1913-
is_cuda_compatible = int(cuda_version[0]) == 12 and int(cuda_version[1]) >= 3
1914-
return is_hopper and is_cuda_compatible
1911+
return torch.cuda.get_device_capability()[0] == cc_major and tuple(
1912+
map(int, torch.version.cuda.split(".")[:2])
1913+
) >= (12, 3)
1914+
1915+
1916+
is_ampere_with_cuda_12_3 = lambda: _check(8)
1917+
is_hopper_with_cuda_12_3 = lambda: _check(9)
19151918

19161919

19171920
def get_free_port():

scripts/ci_install_dependency.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
1616
pip install --upgrade pip
1717

1818
# Install sgl-kernel
19-
pip install sgl-kernel==0.1.0 --no-cache-dir
19+
pip install sgl-kernel==0.1.1 --no-cache-dir
2020

2121
# Install the main package
2222
pip install -e "python[all]"

0 commit comments

Comments
 (0)