@@ -48,12 +48,6 @@ def override_quantization_method(self, *args, **kwargs):
48
48
from sglang .srt .layers .quantization .compressed_tensors .compressed_tensors import (
49
49
CompressedTensorsConfig ,
50
50
)
51
- from sglang .srt .utils import is_cuda , is_hip , mxfp_supported
52
-
53
- is_mxfp_supported = mxfp_supported ()
54
- if is_mxfp_supported :
55
- from sglang .srt .layers .quantization .fp4 import MxFp4Config
56
-
57
51
from sglang .srt .layers .quantization .fp8 import Fp8Config
58
52
from sglang .srt .layers .quantization .gptq import GPTQConfig , GPTQMarlinConfig
59
53
from sglang .srt .layers .quantization .modelopt_quant import (
@@ -67,6 +61,9 @@ def override_quantization_method(self, *args, **kwargs):
67
61
from sglang .srt .layers .quantization .w4afp8 import W4AFp8Config
68
62
from sglang .srt .layers .quantization .w8a8_fp8 import W8A8Fp8Config
69
63
from sglang .srt .layers .quantization .w8a8_int8 import W8A8Int8Config
64
+ from sglang .srt .utils import is_cuda , is_hip , mxfp_supported
65
+
66
+ _is_mxfp_supported = mxfp_supported ()
70
67
71
68
if TYPE_CHECKING :
72
69
from sglang .srt .layers .moe .topk import TopKOutput
@@ -98,11 +95,13 @@ def override_quantization_method(self, *args, **kwargs):
98
95
"mxfp4" : Mxfp4Config ,
99
96
}
100
97
)
101
- elif is_mxfp_supported and is_hip ():
98
+ elif _is_mxfp_supported and is_hip ():
99
+ from sglang .srt .layers .quantization .quark .quark import QuarkConfig
100
+
102
101
BASE_QUANTIZATION_METHODS .update (
103
102
{
104
- "quark" : MxFp4Config ,
105
- "mxfp4" : MxFp4Config ,
103
+ "quark" : QuarkConfig ,
104
+ "mxfp4" : Mxfp4Config ,
106
105
}
107
106
)
108
107
# VLLM-dependent quantization methods
0 commit comments