
add custom chat template #3251


Merged
31 commits merged on Aug 18, 2025
Changes from all commits
Commits
31 commits
1a569e5
add custom chat_template
luukunn Jul 29, 2025
3b4326a
add custom chat_template
luukunn Jul 29, 2025
53d8beb
Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy …
luukunn Jul 29, 2025
db34a06
Resolve merge conflicts
luukunn Aug 6, 2025
23095e5
add unittest
luukunn Aug 6, 2025
dc42a72
fix
luukunn Aug 6, 2025
d44cb51
add docs
luukunn Aug 6, 2025
0b4db9a
fix comment
luukunn Aug 6, 2025
8927f4b
add offline chat
luukunn Aug 6, 2025
4ad201c
fix unit test
luukunn Aug 6, 2025
f5f2c1f
fix unit test
luukunn Aug 7, 2025
b77da03
fix
luukunn Aug 11, 2025
238149e
Merge branch 'develop' into develop
luukunn Aug 11, 2025
ca72e35
Merge branch 'develop' into develop
luukunn Aug 11, 2025
bc9fb4b
fix pre commit
luukunn Aug 11, 2025
78f5804
Merge branch 'develop' of https://github.com/luukunn/FastDeploy into …
luukunn Aug 11, 2025
7b3f43f
fix unit test
luukunn Aug 11, 2025
227ddba
add unit test
luukunn Aug 11, 2025
8a124ad
add unit test
luukunn Aug 12, 2025
43e70c5
add unit test
luukunn Aug 12, 2025
c2189c6
Merge branch 'develop' into develop
luukunn Aug 12, 2025
1502081
fix pre_commit
luukunn Aug 12, 2025
573d7fa
Merge branch 'develop' into develop
luukunn Aug 12, 2025
23beb89
fix enable_thinking
luukunn Aug 18, 2025
f13d985
Merge branch 'develop' of https://github.com/luukunn/FastDeploy into …
luukunn Aug 18, 2025
3f75173
Merge branch 'develop' into develop
luukunn Aug 18, 2025
cb3eae6
fix pre commit
luukunn Aug 18, 2025
9f98538
fix pre commit
luukunn Aug 18, 2025
a9f3bc0
fix unit test
luukunn Aug 18, 2025
24c59e9
add requirements
luukunn Aug 18, 2025
38c73ff
Merge branch 'develop' into develop
luukunn Aug 18, 2025
3 changes: 3 additions & 0 deletions docs/online_serving/README.md
@@ -161,6 +161,9 @@ The following extra parameters are supported:
chat_template_kwargs: Optional[dict] = None
# Additional parameters passed to the chat template, used for customizing dialogue formats (default None).

chat_template: Optional[str] = None
Collaborator review comment: Please also add a description of this launch parameter to docs/zh/parameters.md and docs/parameters.md.

# Custom chat template will override the model's default chat template (default None).

reasoning_max_tokens: Optional[int] = None
# Maximum number of tokens to generate during reasoning (e.g., CoT, chain of thought) (default None means using global max_tokens).

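For illustration, a request that supplies its own template through this field might look like the sketch below; the host, port, model name, and template string are placeholders, not values taken from this PR:

```python
import requests

# Hypothetical FastDeploy OpenAI-compatible endpoint.
url = "http://localhost:8000/v1/chat/completions"

# A minimal Jinja-style template passed inline; it overrides the model's
# default chat template for this request only.
custom_template = (
    "{% for message in messages %}"
    "{{ message['role'] }}: {{ message['content'] }}\n"
    "{% endfor %}"
    "assistant: "
)

payload = {
    "model": "default",  # placeholder model name
    "messages": [{"role": "user", "content": "Hello"}],
    "chat_template": custom_template,
}

resp = requests.post(url, json=payload, timeout=60)
print(resp.json()["choices"][0]["message"]["content"])
```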
1 change: 1 addition & 0 deletions docs/parameters.md
@@ -46,6 +46,7 @@ When using FastDeploy to deploy models (including offline inference and service
| ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
| ```enable_expert_parallel``` | `bool` | Whether to enable expert parallel |
| ```enable_logprob``` | `bool` | Whether to return log probabilities of the output tokens. If true, the log probability of each output token is returned in the message content. If logprob is not used, this parameter can be omitted at startup |
| ```chat_template``` | `str` | Specifies the chat template used to assemble the prompt. It accepts either a template string or a file path. Defaults to None; if not specified, the model's default template is used |

## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```?

3 changes: 3 additions & 0 deletions docs/zh/online_serving/README.md
@@ -160,6 +160,9 @@ repetition_penalty: Optional[float] = None
chat_template_kwargs: Optional[dict] = None
# 传递给聊天模板(chat template)的额外参数,用于自定义对话格式(默认 None)。

chat_template: Optional[str] = None
# 自定义聊天模板,会覆盖模型默认的聊天模板,(默认 None)。

reasoning_max_tokens: Optional[int] = None
# 推理(如 CoT, 思维链)过程中生成的最大 token 数(默认 None 表示使用全局 max_tokens)。

1 change: 1 addition & 0 deletions docs/zh/parameters.md
@@ -44,6 +44,7 @@
| ```dynamic_load_weight``` | `int` | 是否动态加载权重,默认0 |
| ```enable_expert_parallel``` | `bool` | 是否启用专家并行 |
| ```enable_logprob``` | `bool` | 是否启用输出token返回logprob。如果未使用 logprob,则在启动时可以省略此参数。 |
| ```chat_template``` | `str` | 指定模型拼接使用的模板,支持字符串与文件路径,默认为None,如未指定,则使用模型默认模板 |

## 1. KVCache分配与```num_gpu_blocks_override```、```block_size```的关系?

10 changes: 10 additions & 0 deletions fastdeploy/engine/args_utils.py
@@ -94,6 +94,10 @@ class EngineArgs:
"""
specifies the reasoning parser to use for extracting reasoning content from the model output
"""
chat_template: str = None
"""
chat template or chat template file path
"""
tool_call_parser: str = None
"""
specifies the tool call parser to use for extracting tool call from the model output
@@ -442,6 +446,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help="Flag specifies the reasoning parser to use for extracting "
"reasoning content from the model output",
)
model_group.add_argument(
"--chat-template",
type=str,
default=EngineArgs.chat_template,
help="chat template or chat template file path",
)
model_group.add_argument(
"--tool-call-parser",
type=str,
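As a rough sketch of how the new flag flows through the parser built by `EngineArgs.add_cli_args` (the import path of `FlexibleArgumentParser` and the argument values below are assumptions):

```python
from fastdeploy.engine.args_utils import EngineArgs
from fastdeploy.utils import FlexibleArgumentParser  # import path assumed

# Build the engine CLI and parse a hypothetical command line.
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args(
    ["--model", "/path/to/model", "--chat-template", "/path/to/template.jinja"]
)

# The value is either an inline template string or a template file path;
# it defaults to None, in which case the model's own template is used.
print(args.chat_template)
```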
5 changes: 5 additions & 0 deletions fastdeploy/engine/request.py
@@ -72,6 +72,7 @@ def __init__(
guided_json_object: Optional[bool] = None,
enable_thinking: Optional[bool] = True,
trace_carrier: dict = dict(),
chat_template: Optional[str] = None,
) -> None:
self.request_id = request_id
self.prompt = prompt
@@ -111,6 +112,8 @@ def __init__(
self.enable_thinking = enable_thinking
self.trace_carrier = trace_carrier

self.chat_template = chat_template

# token num
self.block_tables = []
self.output_token_ids = []
@@ -152,6 +155,7 @@ def from_dict(cls, d: dict):
guided_json_object=d.get("guided_json_object", None),
enable_thinking=d.get("enable_thinking", True),
trace_carrier=d.get("trace_carrier", {}),
chat_template=d.get("chat_template", None),
)

@property
@@ -191,6 +195,7 @@ def to_dict(self) -> dict:
"draft_token_ids": self.draft_token_ids,
"enable_thinking": self.enable_thinking,
"trace_carrier": self.trace_carrier,
"chat_template": self.chat_template,
}
add_params = [
"guided_json",
35 changes: 34 additions & 1 deletion fastdeploy/entrypoints/chat_utils.py
@@ -16,7 +16,8 @@

import uuid
from copy import deepcopy
from typing import List, Literal, Union
from pathlib import Path
from typing import List, Literal, Optional, Union
from urllib.parse import urlparse

import requests
@@ -159,5 +160,37 @@ def parse_chat_messages(messages):
return conversation


def load_chat_template(
chat_template: Union[Path, str],
is_literal: bool = False,
) -> Optional[str]:
if chat_template is None:
return None
if is_literal:
if isinstance(chat_template, Path):
raise TypeError("chat_template is expected to be read directly " "from its value")

return chat_template

try:
with open(chat_template) as f:
return f.read()
except OSError as e:
if isinstance(chat_template, Path):
raise
JINJA_CHARS = "{}\n"
if not any(c in chat_template for c in JINJA_CHARS):
msg = (
f"The supplied chat template ({chat_template}) "
f"looks like a file path, but it failed to be "
f"opened. Reason: {e}"
)
raise ValueError(msg) from e

# If opening the file fails, fall back to treating chat_template as a
# literal template string so escape sequences are interpreted correctly
return load_chat_template(chat_template, is_literal=True)


def random_tool_call_id() -> str:
return f"chatcmpl-tool-{str(uuid.uuid4().hex)}"
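A quick sketch of how `load_chat_template` behaves for its accepted inputs (the template file here is created on the fly just for the example):

```python
import tempfile

from fastdeploy.entrypoints.chat_utils import load_chat_template

inline = "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"

# A literal Jinja template string is returned unchanged: it cannot be opened
# as a file and it contains template characters, so no error is raised.
assert load_chat_template(inline) == inline

# A path to a template file returns the file's contents.
with tempfile.NamedTemporaryFile("w", suffix=".jinja", delete=False) as f:
    f.write(inline)
assert load_chat_template(f.name) == inline

# None passes through, leaving the model's default template in effect.
assert load_chat_template(None) is None
```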
8 changes: 8 additions & 0 deletions fastdeploy/entrypoints/llm.py
@@ -28,6 +28,7 @@
from fastdeploy.engine.args_utils import EngineArgs
from fastdeploy.engine.engine import LLMEngine
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.chat_utils import load_chat_template
from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
from fastdeploy.plugins.model_register import load_model_register_plugins
from fastdeploy.utils import (
@@ -74,6 +75,7 @@ def __init__(
revision: Optional[str] = "master",
tokenizer: Optional[str] = None,
enable_logprob: Optional[bool] = False,
chat_template: Optional[str] = None,
**kwargs,
):
deprecated_kwargs_warning(**kwargs)
@@ -102,6 +104,7 @@
self.master_node_ip = self.llm_engine.cfg.master_ip
self._receive_output_thread = threading.Thread(target=self._receive_output, daemon=True)
self._receive_output_thread.start()
self.chat_template = load_chat_template(chat_template)

def _check_master(self):
"""
@@ -196,6 +199,7 @@ def chat(
sampling_params: Optional[Union[SamplingParams, list[SamplingParams]]] = None,
use_tqdm: bool = True,
chat_template_kwargs: Optional[dict[str, Any]] = None,
chat_template: Optional[str] = None,
):
"""
Args:
@@ -229,13 +233,17 @@
if sampling_params_len != 1 and len(messages) != sampling_params_len:
raise ValueError("messages and sampling_params must be the same length.")

if chat_template is None:
chat_template = self.chat_template

messages_len = len(messages)
for i in range(messages_len):
messages[i] = {"messages": messages[i]}
req_ids = self._add_request(
prompts=messages,
sampling_params=sampling_params,
chat_template_kwargs=chat_template_kwargs,
chat_template=chat_template,
)

topk_logprobs = sampling_params[0].logprobs if sampling_params_len > 1 else sampling_params.logprobs
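Offline usage could then look roughly like this; the model path, template file, and the shape of the returned outputs are placeholders and assumptions:

```python
from fastdeploy.entrypoints.llm import LLM

# chat_template accepts either an inline template string or a path to an
# existing .jinja file (both are placeholders here).
llm = LLM(model="/path/to/model", chat_template="/path/to/template.jinja")

messages = [[{"role": "user", "content": "Hello"}]]

# A template passed to chat() takes precedence over the one given to the
# constructor; omitting it falls back to self.chat_template.
outputs = llm.chat(
    messages,
    chat_template="{% for m in messages %}{{ m['content'] }}{% endfor %}",
)
print(outputs[0])
```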
4 changes: 3 additions & 1 deletion fastdeploy/entrypoints/openai/api_server.py
@@ -30,6 +30,7 @@

from fastdeploy.engine.args_utils import EngineArgs
from fastdeploy.engine.engine import LLMEngine
from fastdeploy.entrypoints.chat_utils import load_chat_template
from fastdeploy.entrypoints.engine_client import EngineClient
from fastdeploy.entrypoints.openai.protocol import (
ChatCompletionRequest,
@@ -75,6 +76,7 @@
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
args.model = retrive_model_from_server(args.model, args.revision)
chat_template = load_chat_template(args.chat_template)
if args.tool_parser_plugin:
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
llm_engine = None
@@ -139,7 +141,7 @@ async def lifespan(app: FastAPI):
args.tool_call_parser,
)
app.state.dynamic_load_weight = args.dynamic_load_weight
chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time)
chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time, chat_template)
completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips, args.max_waiting_time)
engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
engine_client.pid = pid
1 change: 1 addition & 0 deletions fastdeploy/entrypoints/openai/protocol.py
@@ -524,6 +524,7 @@ class ChatCompletionRequest(BaseModel):

# doc: start-completion-extra-params
chat_template_kwargs: Optional[dict] = None
chat_template: Optional[str] = None
reasoning_max_tokens: Optional[int] = None
structural_tag: Optional[str] = None
guided_json: Optional[Union[str, dict, BaseModel]] = None
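On the client side, the same field can ride along through the OpenAI SDK's `extra_body` pass-through; the base URL, API key, and model name below are assumptions:

```python
from openai import OpenAI

# FastDeploy ignores the API key, but the SDK requires one.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",  # placeholder model name
    messages=[{"role": "user", "content": "Hello"}],
    # Fields outside the official OpenAI schema go through extra_body.
    extra_body={
        "chat_template": "{% for m in messages %}{{ m['content'] }}{% endfor %}"
    },
)
print(response.choices[0].message.content)
```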
5 changes: 4 additions & 1 deletion fastdeploy/entrypoints/openai/serving_chat.py
@@ -49,12 +49,13 @@ class OpenAIServingChat:
OpenAI-style chat completions serving
"""

def __init__(self, engine_client, pid, ips, max_waiting_time):
def __init__(self, engine_client, pid, ips, max_waiting_time, chat_template):
self.engine_client = engine_client
self.pid = pid
self.master_ip = ips
self.max_waiting_time = max_waiting_time
self.host_ip = get_host_ip()
self.chat_template = chat_template
if self.master_ip is not None:
if isinstance(self.master_ip, list):
self.master_ip = self.master_ip[0]
@@ -86,6 +87,8 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
text_after_process = None
try:
current_req_dict = request.to_dict_for_infer(request_id)
if "chat_template" not in current_req_dict:
current_req_dict["chat_template"] = self.chat_template
current_req_dict["arrival_time"] = time.time()
prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
text_after_process = current_req_dict.get("text_after_process")
2 changes: 2 additions & 0 deletions fastdeploy/input/ernie_processor.py
@@ -87,6 +87,7 @@ def process_request(self, request, max_model_len=None, **kwargs):
bool: Whether preprocessing is successful
str: error message
"""
request.chat_template = kwargs.get("chat_template")
request = self._apply_default_parameters(request)
if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
request.eos_token_ids = self.eos_token_ids
@@ -342,6 +343,7 @@ def messages2ids(self, request_or_messages):
tokenize=False,
split_special_tokens=False,
add_special_tokens=False,
chat_template=request_or_messages.get("chat_template", None),
)
request_or_messages["text_after_process"] = spliced_message
req_id = None
1 change: 1 addition & 0 deletions fastdeploy/input/ernie_vl_processor.py
@@ -109,6 +109,7 @@ def set_value(req, key, value):

def process_request(self, request, max_model_len=None, **kwargs):
"""process the input data"""
request.chat_template = kwargs.get("chat_template")
task = request.to_dict()
task["enable_thinking"] = kwargs.get("enable_thinking", True)
self.process_request_dict(task, max_model_len)
2 changes: 2 additions & 0 deletions fastdeploy/input/mm_processor/process.py
@@ -494,10 +494,12 @@ def apply_chat_template(self, request):
"""
if self.tokenizer.chat_template is None:
raise ValueError("This model does not support chat_template.")

prompt_token_template = self.tokenizer.apply_chat_template(
request,
tokenize=False,
add_generation_prompt=request.get("add_generation_prompt", True),
chat_template=request.get("chat_template", None),
)
prompt_token_str = prompt_token_template.replace("<|image@placeholder|>", "").replace(
"<|video@placeholder|>", ""
2 changes: 2 additions & 0 deletions fastdeploy/input/text_processor.py
@@ -204,6 +204,7 @@ def process_request(self, request, max_model_len=None, **kwargs):
bool: Whether preprocessing is successful
str: error message
"""
request.chat_template = kwargs.get("chat_template")
request = self._apply_default_parameters(request)
if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
request.eos_token_ids = self.eos_token_ids
@@ -486,6 +487,7 @@ def messages2ids(self, request):
split_special_tokens=False,
add_special_tokens=False,
return_tensors="pd",
chat_template=request.get("chat_template", None),
)
request["text_after_process"] = spliced_message
req_id = None
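Underneath, the per-request template is simply forwarded to the tokenizer's `apply_chat_template`. The sketch below uses a Hugging Face-style tokenizer to show the interface; the tokenizer path and template are placeholders, and FastDeploy's own tokenizers are assumed to expose the same keyword:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/path/to/model")  # placeholder path

custom_template = (
    "{% for message in messages %}"
    "<|{{ message['role'] }}|>{{ message['content'] }}\n"
    "{% endfor %}"
    "<|assistant|>"
)

messages = [{"role": "user", "content": "Hello"}]

# chat_template=None falls back to the tokenizer's built-in template;
# passing a string overrides it, which is what the request-level field does.
text = tokenizer.apply_chat_template(
    messages, tokenize=False, chat_template=custom_template
)
print(text)
```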
1 change: 1 addition & 0 deletions requirements_dcu.txt
@@ -35,3 +35,4 @@ opentelemetry-instrumentation-mysql
opentelemetry-distro 
opentelemetry-exporter-otlp
opentelemetry-instrumentation-fastapi
partial_json_parser
1 change: 1 addition & 0 deletions requirements_iluvatar.txt
@@ -36,3 +36,4 @@ opentelemetry-instrumentation-mysql
opentelemetry-distro
opentelemetry-exporter-otlp
opentelemetry-instrumentation-fastapi
partial_json_parser
1 change: 1 addition & 0 deletions requirements_metaxgpu.txt
@@ -37,3 +37,4 @@ opentelemetry-instrumentation-mysql
opentelemetry-distro 
opentelemetry-exporter-otlp
opentelemetry-instrumentation-fastapi
partial_json_parser