JustinTong0323
diff --git a/‎examples/runtime/engine/offline_batch_inference_qwen_1m.py
Lines changed: 74 additions & 0 deletions b/‎examples/runtime/engine/offline_batch_inference_qwen_1m.py
Lines changed: 74 additions & 0 deletions
diff --git a/‎python/sglang/srt/configs/model_config.py
Lines changed: 28 additions & 0 deletions b/‎python/sglang/srt/configs/model_config.py
Lines changed: 28 additions & 0 deletions
diff --git a/‎python/sglang/srt/disaggregation/decode_schedule_batch_mixin.py
Lines changed: 3 additions & 0 deletions b/‎python/sglang/srt/disaggregation/decode_schedule_batch_mixin.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎python/sglang/srt/hf_transformers_utils.py
Lines changed: 30 additions & 3 deletions b/‎python/sglang/srt/hf_transformers_utils.py
Lines changed: 30 additions & 3 deletions
@@ -0,0 +1,74 @@
+"""
+Usage:
+python3 offline_batch_inference_qwen_1m.py
+"""
+
+from urllib.request import urlopen
+
+import sglang as sgl
+
+
+def load_prompt() -> str:
+    """Fetch the ``64k.txt`` test prompt over HTTPS and return it as text.
+
+    Returns:
+        The response body decoded as UTF-8.
+    """
+    # Test cases with various lengths can be found at:
+    #
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt
+    host = "https://qianwen-res.oss-cn-beijing.aliyuncs.com"
+    prompt_url = host + "/Qwen2.5-1M/test-data/64k.txt"
+
+    with urlopen(prompt_url, timeout=5) as resp:
+        raw_bytes = resp.read()
+        return raw_bytes.decode("utf-8")
+
+
+# Processing the prompt.
+def process_requests(llm: sgl.Engine, prompts: list[str]) -> None:
+    """Generate a completion for every prompt and print a summary of each.
+
+    Args:
+        llm: Engine whose ``generate`` method receives the prompts.
+        prompts: Prompt strings submitted to ``llm.generate`` in one call.
+    """
+    # Sampling settings forwarded to the engine alongside the prompts.
+    sampling_params = dict(
+        temperature=0.7,
+        top_p=0.8,
+        top_k=20,
+        repetition_penalty=1.05,
+        max_new_tokens=256,
+    )
+    for result in llm.generate(prompts, sampling_params):
+        # ``prompt_tokens`` is reported under ``meta_info`` of each output.
+        num_prompt_tokens = result["meta_info"]["prompt_tokens"]
+        completion = result["text"]
+        print(f"Prompt length: {num_prompt_tokens}, Generated text: {completion!r}")
+
+
+# Create an LLM.
+def initialize_engine() -> sgl.Engine:
+    """Construct the ``sgl.Engine`` used by this example.
+
+    The engine loads ``Qwen/Qwen2.5-7B-Instruct-1M`` with a 1048576-token
+    context length, the ``dual_chunk_flash_attn`` attention backend and
+    ``tp_size=4``.
+
+    Returns:
+        The newly created engine.
+    """
+    engine_kwargs = {
+        "model_path": "Qwen/Qwen2.5-7B-Instruct-1M",
+        "context_length": 1048576,
+        "page_size": 256,
+        "attention_backend": "dual_chunk_flash_attn",
+        "tp_size": 4,
+        "disable_radix_cache": True,
+        "enable_mixed_chunk": False,
+        "enable_torch_compile": False,
+        "chunked_prefill_size": 131072,
+        "mem_fraction_static": 0.6,
+        "log_level": "DEBUG",
+    }
+    return sgl.Engine(**engine_kwargs)
+
+
+def main():
+    """Create the engine, fetch the test prompt and run one generation."""
+    # The engine is built first; Python evaluates call arguments left to
+    # right, so the prompt download only starts once the engine exists.
+    process_requests(initialize_engine(), [load_prompt()])
+
+
+if __name__ == "__main__":
+    main()
@@ -27,6 +27,7 @@
     get_context_length,
     get_generation_config,
     get_hf_text_config,
+    get_sparse_attention_config,
 )
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
@@ -270,6 +271,9 @@ def __init__(
         # Verify quantization
         self._verify_quantization()
 
+        # Verify dual-chunk attention config
+        self._verify_dual_chunk_attention_config()
+
         # Cache attributes
         self.hf_eos_token_id = self.get_hf_eos_token_id()
 
@@ -297,6 +301,13 @@ def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
             **kwargs,
         )
 
+    def get_total_num_attention_heads(self) -> int:
+        """Return ``self.num_attention_heads`` unchanged."""
+        return self.num_attention_heads
+
+    def get_num_attention_heads(self, tensor_parallel_size) -> int:
+        """Return the attention-head count divided across ``tensor_parallel_size``.
+
+        The division is floored, and the result is never smaller than 1.
+        """
+        heads_per_shard = self.num_attention_heads // tensor_parallel_size
+        return heads_per_shard if heads_per_shard > 1 else 1
+
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
@@ -484,6 +495,23 @@ def _verify_quantization(self) -> None:
                     self.quantization,
                 )
 
+    def _verify_dual_chunk_attention_config(self) -> None:
+        """Attach the sparse attention config to the dual-chunk attention config.
+
+        Does nothing when ``hf_config`` has no ``dual_chunk_attention_config``
+        attribute or when no sparse attention config is found for
+        ``self.model_path``. Otherwise the loaded config is stored under
+        ``"sparse_attention_config"`` and ``"sparse_attention_enabled"`` is set
+        to ``True`` unless that key is already present.
+        """
+        if not hasattr(self.hf_config, "dual_chunk_attention_config"):
+            return
+
+        # Try loading the sparse attention config
+        loaded_sparse_config = get_sparse_attention_config(self.model_path)
+        if not loaded_sparse_config:
+            return
+
+        dca_config = self.hf_config.dual_chunk_attention_config
+        dca_config["sparse_attention_config"] = loaded_sparse_config
+        # Keep an explicit setting from the model config if there is one.
+        if "sparse_attention_enabled" not in dca_config:
+            dca_config["sparse_attention_enabled"] = True
+
     def get_hf_eos_token_id(self) -> Optional[Set[int]]:
         eos_ids = getattr(self.hf_config, "eos_token_id", None)
         if eos_ids is not None:
 
@@ -76,6 +76,9 @@ def prepare_for_prebuilt_extend(self: ScheduleBatch):
             req_pool_indices, dtype=torch.int64, device=self.device
         )
         self.seq_lens = torch.tensor(seq_lens, dtype=torch.int64, device=self.device)
+        self.orig_seq_lens = torch.tensor(
+            seq_lens, dtype=torch.int32, device=self.device
+        )
         self.out_cache_loc = out_cache_loc
         self.seq_lens_sum = sum(seq_lens)
 
 
@@ -14,10 +14,11 @@
 """Utilities for Huggingface Transformers."""
 
 import contextlib
+import json
 import os
 import warnings
 from pathlib import Path
-from typing import Dict, Optional, Type, Union
+from typing import Any, Dict, Optional, Type, Union
 
 import torch
 from huggingface_hub import snapshot_download
@@ -62,11 +63,17 @@
         AutoConfig.register(name, cls)
 
 
-def download_from_hf(model_path: str):
+def download_from_hf(
+    model_path: str,
+    allow_patterns: Optional[Union[str, list]] = None,
+):
+    """Return a local path for ``model_path``, downloading a snapshot if needed.
+
+    Args:
+        model_path: An existing filesystem path (returned unchanged) or an
+            identifier passed to ``snapshot_download``.
+        allow_patterns: Patterns forwarded to ``snapshot_download``; a falsy
+            value selects ``["*.json", "*.bin", "*.model"]``.
+
+    Returns:
+        ``model_path`` itself when it exists locally, otherwise the value
+        returned by ``snapshot_download``.
+    """
     if os.path.exists(model_path):
         return model_path
 
-    return snapshot_download(model_path, allow_patterns=["*.json", "*.bin", "*.model"])
+    if not allow_patterns:
+        allow_patterns = ["*.json", "*.bin", "*.model"]
+
+    return snapshot_download(model_path, allow_patterns=allow_patterns)
 
 
 def get_hf_text_config(config: PretrainedConfig):
@@ -171,6 +178,26 @@ def get_generation_config(
         return None
 
 
+# Qwen-1M related
+def get_sparse_attention_config(
+    model: str,
+    sparse_attention_config_filename: str = "sparse_attention_config.json",
+) -> Dict[str, Any]:
+    """Load a model's sparse attention config, if it ships one.
+
+    Args:
+        model: Local model directory, or an identifier resolved through
+            ``download_from_hf`` (only ``*.json`` files are requested).
+        sparse_attention_config_filename: File name looked up inside the
+            model directory.
+
+    Returns:
+        The parsed JSON content, or an empty dict when the file is absent.
+    """
+    if not os.path.isdir(model):
+        # Download the config files.
+        model = download_from_hf(model, allow_patterns=["*.json"])
+
+    config_file = os.path.join(model, sparse_attention_config_filename)
+    if not os.path.exists(config_file):
+        return {}
+
+    # JSON is UTF-8 by specification; do not rely on the locale's default
+    # encoding when decoding the file.
+    with open(config_file, encoding="utf-8") as f:
+        return json.load(f)
+
+
 # Models don't use the same configuration key for determining the maximum
 # context length.  Store them here so we can sanely check them.
 # NOTE: The ordering here is important. Some models have two of these and we
Original file line number	Diff line number	Diff line change
`@@ -76,6 +76,9 @@ def prepare_for_prebuilt_extend(self: ScheduleBatch):`
`76`	`76`	`req_pool_indices, dtype=torch.int64, device=self.device`
`77`	`77`	`)`
`78`	`78`	`self.seq_lens = torch.tensor(seq_lens, dtype=torch.int64, device=self.device)`
	`79`	`+ self.orig_seq_lens = torch.tensor(`
	`80`	`+ seq_lens, dtype=torch.int32, device=self.device`
	`81`	`+ )`
`79`	`82`	`self.out_cache_loc = out_cache_loc`
`80`	`83`	`self.seq_lens_sum = sum(seq_lens)`
`81`	`84`