
Commit fffeec5

AniZpZ, huangtingwei9988, and laixinn authored and committed

[2/3] fix dsv3 awq issue (sgl-project#4625)

Co-authored-by: 晟海 <huangtingwei.htw@antgroup.com>
Co-authored-by: laixinn <xielx@shanghaitech.edu.cn>
1 parent 49731f1 commit fffeec5

File tree

8 files changed: +1139 −42 lines changed

benchmark/deepseek_v3/README.md (3 additions, 2 deletions)

````diff
@@ -178,10 +178,11 @@ python3 -m sglang.bench_one_batch_server --model None --base-url http://10.0.0.1

 ### Example: Serving with 8 A100/A800 with AWQ Quantization

-AWQ does not support BF16, so add the `--dtype half` flag if AWQ is used for quantization. One example is as follows:
+Add `--quantization moe_wna16` flag to enable moe wna16 kernel for better performance.
+One example is as follows:

 ```bash
-python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --dtype half
+python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --quantization moe_wna16
 ```
````
python/sglang/srt/configs/model_config.py (1 addition, 0 deletions)

```diff
@@ -258,6 +258,7 @@ def _verify_quantization(self) -> None:
             "experts_int8",
             "w8a8_int8",
             "w8a8_fp8",
+            "moe_wna16",
         ]
         compatible_quantization_methods = {
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
```
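The one-line change above simply adds `"moe_wna16"` to the allow-list that `_verify_quantization` checks server arguments against. A minimal standalone sketch of that validation pattern is below; the function name and error message are simplified stand-ins, not SGLang's actual implementation:

```python
# Hypothetical sketch of an allow-list quantization check, modeled on the
# list shown in the model_config.py diff. Not SGLang's real code.
SUPPORTED_QUANTIZATION = [
    "experts_int8",
    "w8a8_int8",
    "w8a8_fp8",
    "moe_wna16",  # accepted after this commit
]

def verify_quantization(method: str) -> str:
    """Return the method unchanged if supported, else raise ValueError."""
    if method not in SUPPORTED_QUANTIZATION:
        raise ValueError(
            f"Unknown quantization method: {method!r}. "
            f"Supported methods: {SUPPORTED_QUANTIZATION}"
        )
    return method
```

With this entry present, a launch command passing `--quantization moe_wna16` passes validation instead of being rejected at startup.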
