
Commit 04c2f3c

mm support structured output
1 parent b630031 commit 04c2f3c

File tree

18 files changed: +422 −105 lines changed


docs/features/structured_outputs.md

Lines changed: 62 additions & 0 deletions
@@ -330,3 +330,65 @@ (appended after the existing example output, which ends with "Height: 468")

### Offline Inference

Offline inference allows restricting the model's output format through pre-specified constraints. In `FastDeploy`, constraints can be specified through the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, with usage similar to online inference:

```python
json: Optional[Union[str, dict]] = None
regex: Optional[str] = None
choice: Optional[List[str]] = None
grammar: Optional[str] = None
json_object: Optional[bool] = None
structural_tag: Optional[str] = None
```

The following example demonstrates how to use offline inference to generate structured JSON:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams
from pydantic import BaseModel
from enum import Enum

class BookType(str, Enum):
    romance = "Romance"
    historical = "Historical"
    adventure = "Adventure"
    mystery = "Mystery"
    dystopian = "Dystopian"

class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType

# Constrained decoding parameters
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

# Sampling parameters
sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=6400,
    guided_decoding=guided_decoding_params,
)

# Load model
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

outputs = llm.generate(
    prompts="Generate a JSON describing a literary work, including author, title and book type.",
    sampling_params=sampling_params,
)

# Output results
for output in outputs:
    print(output.outputs.text)
```

Output:

```
{"author": "George Orwell", "title": "1984", "genre": "Dystopian"}
```
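The appended section documents only the `json` constraint end to end; the other `GuidedDecodingParams` fields follow the same pattern. Below is a minimal sketch of a `choice` constraint, assuming the same API as the example above (the prompt, token budget, and printed result are illustrative, not from this commit):

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams

# Restrict the output to one of a fixed set of strings
guided_decoding_params = GuidedDecodingParams(
    choice=["Romance", "Historical", "Adventure", "Mystery", "Dystopian"])

sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=64,
    guided_decoding=guided_decoding_params,
)

llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192,
          guided_decoding_backend="auto")

outputs = llm.generate(
    prompts="Which genre best describes the novel 1984?",
    sampling_params=sampling_params,
)

for output in outputs:
    print(output.outputs.text)  # e.g. "Dystopian"
```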

docs/zh/features/structured_outputs.md

Lines changed: 64 additions & 0 deletions
@@ -330,3 +330,67 @@ (appended after the existing example output, which ends with "高度: 468")

### Offline Inference

Offline inference allows restricting the model's output format through pre-specified constraints. In `FastDeploy`, constraints can be specified through the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types; their usage follows the online inference examples:

```python
json: Optional[Union[str, dict]] = None
regex: Optional[str] = None
choice: Optional[List[str]] = None
grammar: Optional[str] = None
json_object: Optional[bool] = None
structural_tag: Optional[str] = None
```

The following example demonstrates how to use offline inference to generate structured JSON:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams
from pydantic import BaseModel
from enum import Enum

class BookType(str, Enum):
    romance = "Romance"
    historical = "Historical"
    adventure = "Adventure"
    mystery = "Mystery"
    dystopian = "Dystopian"

class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType

# Constrained decoding parameters
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

# Sampling parameters
sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=6400,
    guided_decoding=guided_decoding_params,
)

# Load model
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

outputs = llm.generate(
    prompts="生成一个JSON,描述一本中国的著作,要包含作者、标题和书籍类型。",
    sampling_params=sampling_params,
)

# Output results
for output in outputs:
    print(output.outputs.text)
```

Output:

```
{"author": "曹雪芹", "title": "红楼梦", "genre": "Historical"}
```

fastdeploy/config.py

Lines changed: 1 addition & 0 deletions
@@ -83,6 +83,7 @@ def __init__(
         self.dtype = ""
         self.enable_logprob = False
         self.enable_mm = False
+        self.reasoning_parser = None

         for key, value in args.items():
             if hasattr(self, key):

fastdeploy/engine/config.py

Lines changed: 8 additions & 6 deletions
@@ -24,7 +24,7 @@
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.utils import (ceil_div, check_unified_ckpt, get_host_ip,
-                              is_port_available, get_random_port, llm_logger)
+                              get_random_port, is_port_available, llm_logger)

 TaskOption = Literal["generate"]

@@ -701,7 +701,7 @@ def __init__(
         self.max_num_batched_tokens = max_num_batched_tokens
         self.tensor_parallel_size = tensor_parallel_size
         self.dist_init_ip = dist_init_ip
-
+
         self.nnode = nnodes
         self.node_rank = node_rank
         if self.dist_init_ip is None:

@@ -805,7 +805,8 @@ def postprocess(self):
             self.max_model_len // self.cache_config.block_size)

         if self.guided_decoding_backend == "auto":
-            if self.enable_mm:
+            if current_platform.is_xpu() or self.speculative_config.method is not None:
+                llm_logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.")
                 self.guided_decoding_backend = "off"
             else:
                 self.guided_decoding_backend = "xgrammar"

@@ -872,10 +873,10 @@ def check(self):
                 f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."

             if self.guided_decoding_backend != "off":
-                # TODO: mm support guided_decoding
-                assert self.enable_mm is False, "Multimodal model currently do not support guided_decoding"

                 # TODO: speculative decoding support guided_decoding
+                assert self.speculative_config.method is None, \
+                    "speculative decoding currently do not support guided_decoding"

                 # TODO: xpu support guided_decoding
                 assert not current_platform.is_xpu(

@@ -907,7 +908,8 @@ def print(self, file=None):
                     k == "model_config" or
                     k == "scheduler_config" or
                     k == "parallel_config" or
-                    k == "commit_config"):
+                    k == "commit_config" or
+                    k == "speculative_config"):
                 v.print()
             else:
                 llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
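Taken together, the `postprocess()` and `check()` hunks stop excluding multimodal models from guided decoding; only XPU and speculative decoding now force it off. A condensed restatement of the new `auto` resolution (a sketch based on the hunk above, not the full method):

```python
# Condensed view of guided_decoding_backend == "auto" resolution after this commit
if self.guided_decoding_backend == "auto":
    if current_platform.is_xpu() or self.speculative_config.method is not None:
        # XPU and speculative decoding still lack guided-decoding support
        self.guided_decoding_backend = "off"
    else:
        # multimodal models (enable_mm) no longer fall back to "off"
        self.guided_decoding_backend = "xgrammar"
```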

fastdeploy/engine/engine.py

Lines changed: 17 additions & 3 deletions
@@ -363,10 +363,16 @@ def _insert_zmq_task_to_scheduler(self):
                 request = Request.from_dict(data)
                 start_span("ENQUEUE_ZMQ", data, trace.SpanKind.PRODUCER)

-
                 llm_logger.debug(f"Receive request: {request}")

                 err_msg = None
+                if ((request.guided_json is not None
+                        or request.guided_regex is not None
+                        or request.structural_tag is not None
+                        or request.guided_grammar is not None) and self.guided_decoding_checker is None):
+                    err_msg = "guided_backend is None, use --guided-decoding-backend to " \
+                              "specify the backend at server startup."
+
                 if self.guided_decoding_checker is not None:
                     request, err_msg = self.guided_decoding_checker.schema_format(
                         request)

@@ -455,6 +461,14 @@ def add_requests(self, task, sampling_params=None, **kwargs):
             llm_logger.error(error_msg)
             raise EngineError(error_msg, error_code=400)

+        if ((request.guided_json is not None
+                or request.guided_regex is not None
+                or request.structural_tag is not None
+                or request.guided_grammar is not None) and self.guided_decoding_checker is None):
+            err_msg = "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup."
+            llm_logger.error(err_msg)
+            raise EngineError(err_msg, error_code=400)
+
         if self.guided_decoding_checker is not None:
             request, err_msg = self.guided_decoding_checker.schema_format(
                 request)

@@ -1036,8 +1050,8 @@ def _start_worker_service(self):
             f" --speculative_benchmark_mode {self.cfg.speculative_config.benchmark_mode}"
             f" --graph_optimization_config '{self.cfg.graph_optimization_config.to_json_string()}'"
             f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
-            f" --load_strategy {self.cfg.model_config.load_strategy}")
-
+            f" --load_strategy {self.cfg.model_config.load_strategy}"
+            f" --reasoning_parser {self.cfg.reasoning_parser}")

         worker_append_flag = {
             "enable_expert_parallel":

fastdeploy/engine/sampling_params.py

Lines changed: 47 additions & 2 deletions
@@ -92,6 +92,7 @@ class SamplingParams:
     min_tokens: int = 1
     logprobs: Optional[int] = None
     bad_words: Optional[List[str]] = None
+    guided_decoding: Optional[GuidedDecodingParams] = None

     @classmethod
     def from_dict(cls, req_dict: dict[str, Any]) -> "SamplingParams":

@@ -121,7 +122,8 @@ def from_optional(cls,
                       reasoning_max_tokens=None,
                       min_tokens=1,
                       logprobs=None,
-                      bad_words=None) -> "SamplingParams":
+                      bad_words=None,
+                      guided_decoding=None) -> "SamplingParams":
         """Create instance from command line arguments"""
         return cls(n=1 if n is None else n,
                    best_of=best_of,

@@ -141,7 +143,8 @@ def from_optional(cls,
                    reasoning_max_tokens=reasoning_max_tokens,
                    min_tokens=min_tokens,
                    logprobs=logprobs,
-                   bad_words=bad_words)
+                   bad_words=bad_words,
+                   guided_decoding=guided_decoding)

     def __post_init__(self):
         if self.seed is None:

@@ -224,3 +227,45 @@ class BeamSearchParams:
     temperature: float = 0.0
     length_penalty: float = 1.0
     include_stop_str_in_output: bool = False
+
+
+@dataclass
+class GuidedDecodingParams:
+    """Guided decoding parameters for text generation."""
+    json: Optional[Union[str, dict]] = None
+    regex: Optional[str] = None
+    choice: Optional[List[str]] = None
+    grammar: Optional[str] = None
+    json_object: Optional[bool] = None
+    structural_tag: Optional[str] = None
+
+    def to_dict(self):
+        """convert to dict"""
+        key_dict = {
+            "guided_json": self.json,
+            "guided_regex": self.regex,
+            "guided_choice": self.choice,
+            "guided_grammar": self.grammar,
+            "structural_tag": self.structural_tag,
+            "guided_json_object": self.json_object,
+        }
+
+        guided_dict = {}
+        for key, value in key_dict.items():
+            if value is not None:
+                guided_dict[key] = value
+        return guided_dict
+
+    def __post_init__(self):
+        """Verify the arguments."""
+        guided_count = sum([
+            self.json is not None, self.regex is not None, self.choice
+            is not None, self.grammar is not None, self.json_object
+            is not None, self.structural_tag is not None
+        ])
+
+        if guided_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('json', 'json_object', 'regex', 'choice', 'grammar', 'structural_tag')."
+            )
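A quick check of the new dataclass, based only on the code added above: `to_dict()` renames the populated fields to the request-level `guided_*` keys, and `__post_init__` rejects mixing constraint kinds. A minimal usage sketch:

```python
from fastdeploy.engine.sampling_params import GuidedDecodingParams

params = GuidedDecodingParams(regex=r"\d{4}-\d{2}-\d{2}")
print(params.to_dict())          # {'guided_regex': '\\d{4}-\\d{2}-\\d{2}'}

# Combining constraint kinds is rejected by __post_init__:
try:
    GuidedDecodingParams(json={"type": "object"}, regex=r".*")
except ValueError as err:
    print(err)                   # only one kind of guided decoding may be set
```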

fastdeploy/entrypoints/llm.py

Lines changed: 5 additions & 2 deletions
@@ -89,7 +89,7 @@ def __init__(
         self._receive_output_thread = threading.Thread(
             target=self._receive_output, daemon=True)
         self._receive_output_thread.start()
-
+
     def _check_master(self):
         """
         Check if the current node is the master node.

@@ -198,7 +198,7 @@ def chat(
         if not self._check_master():
             err_msg = f"Only master node can accept completion request, please send request to master node: {self.master_node_ip}"
             raise ValueError(err_msg)
-
+
         if sampling_params is None:
             sampling_params = self.default_sampling_params

@@ -275,6 +275,9 @@ def _add_request(
             if chat_template_kwargs is not None:
                 enable_thinking = chat_template_kwargs.get(
                     "enable_thinking", None)
+            if current_sampling_params.guided_decoding is not None:
+                guided_decoding_dict = current_sampling_params.guided_decoding.to_dict()
+                tasks.update(guided_decoding_dict)
             self.llm_engine.add_requests(tasks,
                                          current_sampling_params,
                                          enable_thinking=enable_thinking)
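With this hunk, a `guided_decoding` set on `SamplingParams` is flattened into the per-request task dict before `add_requests` is called. A minimal sketch of the effect (the prompt and schema values are illustrative):

```python
from fastdeploy.engine.sampling_params import GuidedDecodingParams

# Illustrative task dict as built in LLM._add_request
tasks = {"prompt": "Generate a JSON describing a literary work."}
guided = GuidedDecodingParams(json={"type": "object"})
if guided is not None:
    tasks.update(guided.to_dict())   # adds {"guided_json": {"type": "object"}}
# tasks then flows into llm_engine.add_requests(tasks, sampling_params, ...)
```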

fastdeploy/input/ernie_processor.py

Lines changed: 3 additions & 5 deletions
@@ -60,7 +60,6 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None):
         self.eos_token_ids = [self.tokenizer.eos_token_id]
         self.eos_token_id_len = len(self.eos_token_ids)
         self.pad_token_id = self.get_pad_id()
-        self.reasoning_parser = None
         if reasoning_parser_obj:
             self.reasoning_parser = reasoning_parser_obj(self.tokenizer)

@@ -100,7 +99,6 @@ def process_request(self, request, max_model_len=None, **kwargs):

         if request.prompt_token_ids is None or len(
                 request.prompt_token_ids) == 0:
-            system = request.get("system")
             if request.prompt is None and request.messages is None:
                 raise ValueError(
                     f"The request should have `input_ids`, `text` or `messages`: {request}.")

@@ -149,7 +147,6 @@ def process_request_dict(self, request, max_model_len=None):
            request['stop_token_ids'] = stop_seqs
            request['stop_seqs_len'] = stop_seqs_len

-        system = request.get("system")
        # 处理prompt_token_ids
        if not request.get('prompt_token_ids'):
            if request.get('prompt') is None and request.get(

@@ -213,7 +210,7 @@ def process_response(self, response_dict, **kwargs):
            response_dict.outputs.reasoning_content = reasoning_content
        else:
            response_dict.outputs.text = full_text
-        data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
+        data_processor_logger.info(f"req_id:{req_id}, token ids: {token_ids}")
        if response_dict.outputs.text == "" and \
                response_dict.outputs.reasoning_content == "":
            return None

@@ -278,7 +275,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
        Returns:
            Dict: response contain text fields
        """
-        enable_thinking = kwargs.get("enable_thinking")
        is_end = response_dict["finished"]
        req_id = response_dict["request_id"]
        token_ids = response_dict["outputs"]["token_ids"]

@@ -288,6 +284,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
            token_ids = token_ids[:-1]
        delta_text, previous_token_ids, previous_texts = self.ids2tokens(
            token_ids, req_id)
+
+        enable_thinking = self.get_enable_thinking(kwargs.get("enable_thinking"))
        if enable_thinking and self.reasoning_parser:
            reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming(
                previous_texts, previous_texts + delta_text, delta_text,
