
Commit b77da03

fix
1 parent f5f2c1f commit b77da03

File tree

12 files changed: +104 -37 lines

docs/parameters.md

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ When using FastDeploy to deploy models (including offline inference and service
 | ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
 | ```enable_expert_parallel``` | `bool` | Whether to enable expert parallel |
 | ```enable_logprob``` | `bool` | Whether to return log probabilities of the output tokens. If true, the log probability of each output token is returned in the message content. If logprob is not used, this parameter can be omitted at startup |
+| ```chat_template``` | `str` | Specifies the template used to assemble the model's prompt. It accepts either a template string or a file path. The default value is None; if not specified, the model's default template is used. |

 ## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```?
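
A minimal sketch of how the new `chat_template` parameter might be supplied for offline inference, assuming the `LLM` entrypoint accepts it as a constructor keyword the same way the engine arguments expose it (the model name and template path below are placeholders):

```python
from fastdeploy.entrypoints.llm import LLM

# Assumption: chat_template maps onto the documented engine parameter and may
# be either an inline template string or a path to a template file.
# Omitting it (or passing None) keeps the model's default template.
llm = LLM(
    model="<model_name_or_path>",             # placeholder
    chat_template="./custom_template.jinja",  # placeholder template file
)
```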

docs/zh/parameters.md

Lines changed: 1 addition & 0 deletions

@@ -42,6 +42,7 @@
 | ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
 | ```enable_expert_parallel``` | `bool` | Whether to enable expert parallel |
 | ```enable_logprob``` | `bool` | Whether to return log probabilities for the output tokens. If logprob is not used, this parameter can be omitted at startup. |
+| ```chat_template``` | `str` | Specifies the template used to assemble the model's prompt; accepts either a template string or a file path. Defaults to None; if not specified, the model's default template is used. |

 ## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```?

fastdeploy/engine/engine.py

Lines changed: 1 addition & 4 deletions

@@ -497,10 +497,7 @@ def add_requests(self, task, sampling_params=None, **kwargs):
         request.sampling_params = sampling_params
         request.preprocess_start_time = time.time()

-        enable_thinking = None
-        if kwargs is not None:
-            enable_thinking = kwargs.get("enable_thinking", None)
-        request = self.data_processor.process_request(request, self.cfg.max_model_len, enable_thinking=enable_thinking)
+        request = self.data_processor.process_request(request, self.cfg.max_model_len, **kwargs)
         request.prompt_token_ids_len = len(request.prompt_token_ids)
         request.need_prefill_tokens = request.prompt_token_ids_len
         input_ids_len = request.prompt_token_ids_len
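
With this change, add_requests no longer special-cases enable_thinking; any extra keyword arguments are forwarded verbatim to the data processor. A hedged sketch of a call that relies on this pass-through (the engine instance and task object are placeholders; the keyword name is the one the input processors below read):

```python
# Sketch only: `engine` is an initialized LLMEngine and `task` a Request-like
# object; both are elided here.
engine.add_requests(
    task,
    sampling_params,
    chat_template="./custom_template.jinja",  # consumed via kwargs.get("chat_template")
)
```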

fastdeploy/entrypoints/chat_utils.py

Lines changed: 15 additions & 10 deletions

@@ -15,7 +15,8 @@
 """

 from copy import deepcopy
-from typing import List, Literal, Union, Optional
+from pathlib import Path
+from typing import List, Literal, Optional, Union
 from urllib.parse import urlparse

 import requests

@@ -29,7 +30,6 @@

 from fastdeploy.multimodal.image import ImageMediaIO
 from fastdeploy.multimodal.video import VideoMediaIO
-from pathlib import Path


 class VideoURL(TypedDict, total=False):

@@ -158,16 +158,19 @@ def parse_chat_messages(messages):
         conversation.append({"role": role, "content": parsed_content})
     return conversation

-def load_chat_template(chat_template: Union[Path, str], is_literal: bool = False,) -> Optional[str]:
+
+def load_chat_template(
+    chat_template: Union[Path, str],
+    is_literal: bool = False,
+) -> Optional[str]:
     if chat_template is None:
         return None
     if is_literal:
         if isinstance(chat_template, Path):
-            raise TypeError("chat_template is expected to be read directly "
-                            "from its value")
+            raise TypeError("chat_template is expected to be read directly " "from its value")

         return chat_template
-
+
     try:
         with open(chat_template) as f:
             return f.read()

@@ -176,11 +179,13 @@ def load_chat_template(chat_template: Union[Path, str], is_literal: bool = False
             raise
         JINJA_CHARS = "{}\n"
         if not any(c in chat_template for c in JINJA_CHARS):
-            msg = (f"The supplied chat template ({chat_template}) "
-                   f"looks like a file path, but it failed to be "
-                   f"opened. Reason: {e}")
+            msg = (
+                f"The supplied chat template ({chat_template}) "
+                f"looks like a file path, but it failed to be "
+                f"opened. Reason: {e}"
+            )
             raise ValueError(msg) from e

     # If opening a file fails, set chat template to be args to
     # ensure we decode so our escape are interpreted correctly
-    return load_chat_template(chat_template, is_literal=True)
+    return load_chat_template(chat_template, is_literal=True)
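
Based only on the logic visible in these hunks, a short sketch of how the reworked load_chat_template behaves (the template path is a placeholder):

```python
from fastdeploy.entrypoints.chat_utils import load_chat_template

# None passes through unchanged, meaning "use the model's default template".
assert load_chat_template(None) is None

# A string that cannot be opened as a file but contains Jinja characters
# ("{", "}" or newlines) is returned verbatim via the is_literal fallback.
inline = "{% for m in messages %}{{ m['content'] }}{% endfor %}"
assert load_chat_template(inline) == inline

# A readable path (placeholder here) is opened and its contents returned;
# a non-existent plain path with no Jinja characters raises ValueError instead.
template_text = load_chat_template("./custom_template.jinja")
```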

fastdeploy/entrypoints/llm.py

Lines changed: 6 additions & 8 deletions

@@ -28,14 +28,14 @@
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.engine import LLMEngine
 from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.entrypoints.chat_utils import load_chat_template
 from fastdeploy.plugins.model_register import load_model_register_plugins
 from fastdeploy.utils import (
     deprecated_kwargs_warning,
     llm_logger,
     retrive_model_from_server,
 )
 from fastdeploy.worker.output import Logprob, LogprobsLists
-from fastdeploy.entrypoints.chat_utils import load_chat_template

 root_logger = logging.getLogger()
 for handler in root_logger.handlers[:]:

@@ -228,17 +228,18 @@ def chat(

         if sampling_params_len != 1 and len(messages) != sampling_params_len:
             raise ValueError("messages and sampling_params must be the same length.")
-
+
         if chat_template is None:
             chat_template = self.chat_template

         messages_len = len(messages)
         for i in range(messages_len):
-            messages[i] = {"messages": messages[i], "chat_template": chat_template}
+            messages[i] = {"messages": messages[i]}
         req_ids = self._add_request(
             prompts=messages,
             sampling_params=sampling_params,
             chat_template_kwargs=chat_template_kwargs,
+            chat_template=chat_template,
         )

         topk_logprobs = sampling_params[0].logprobs if sampling_params_len > 1 else sampling_params.logprobs

@@ -251,7 +252,7 @@ def _add_request(
         self,
         prompts,
         sampling_params,
-        chat_template_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
     ):
         """
         Add a request to the LLM Engine and return the ID of that request.

@@ -292,10 +293,7 @@ def _add_request(
                 current_sampling_params = sampling_params[i]
             else:
                 current_sampling_params = sampling_params
-            enable_thinking = None
-            if chat_template_kwargs is not None:
-                enable_thinking = chat_template_kwargs.get("enable_thinking", None)
-            self.llm_engine.add_requests(tasks, current_sampling_params, enable_thinking=enable_thinking)
+            self.llm_engine.add_requests(tasks, current_sampling_params, **kwargs)
         return req_ids

     def _decode_token(self, token_id: int) -> str:
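
A hedged sketch of the per-call override this wiring enables, assuming chat() keeps a public signature with messages, sampling_params, chat_template and chat_template_kwargs as the hunks above suggest (the model name is a placeholder):

```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

llm = LLM(model="<model_name_or_path>")  # placeholder

outputs = llm.chat(
    messages=[[{"role": "user", "content": "What does block_size control?"}]],
    sampling_params=SamplingParams(temperature=0.8),
    # Per-call template; left as None, the instance-level self.chat_template is used.
    chat_template="{% for m in messages %}{{ m['content'] }}{% endfor %}",
    chat_template_kwargs={"enable_thinking": False},
)
```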

fastdeploy/entrypoints/openai/api_server.py

Lines changed: 2 additions & 2 deletions

@@ -28,6 +28,7 @@

 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.engine import LLMEngine
+from fastdeploy.entrypoints.chat_utils import load_chat_template
 from fastdeploy.entrypoints.engine_client import EngineClient
 from fastdeploy.entrypoints.openai.protocol import (
     ChatCompletionRequest,

@@ -54,7 +55,6 @@
     is_port_available,
     retrive_model_from_server,
 )
-from fastdeploy.entrypoints.chat_utils import load_chat_template

 parser = FlexibleArgumentParser()
 parser.add_argument("--port", default=8000, type=int, help="port to the http server")

@@ -65,6 +65,7 @@
 parser = EngineArgs.add_cli_args(parser)
 args = parser.parse_args()
 args.model = retrive_model_from_server(args.model, args.revision)
+chat_template = load_chat_template(args.chat_template)

 llm_engine = None

@@ -105,7 +106,6 @@ async def lifespan(app: FastAPI):
         pid = os.getppid()
     else:
         pid = os.getpid()
-    chat_template = load_chat_template(args.chat_template)
     api_server_logger.info(f"{pid}")
     engine_client = EngineClient(
         args.model,

fastdeploy/entrypoints/openai/serving_chat.py

Lines changed: 2 additions & 2 deletions

@@ -85,9 +85,9 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
         api_server_logger.info(f"create chat completion request: {request_id}")

         try:
-            if request.chat_template is None:
-                request.chat_template = self.chat_template
             current_req_dict = request.to_dict_for_infer(request_id)
+            if "chat_template" not in current_req_dict:
+                current_req_dict["chat_template"] = self.chat_template
             current_req_dict["arrival_time"] = time.time()
             prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
             if isinstance(prompt_token_ids, np.ndarray):
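
Since ChatCompletionRequest already carries a chat_template field (see the removed lines), the server-side default is now applied only when the client omits it. A hedged client-side sketch, assuming the OpenAI-compatible endpoint accepts the extra field through the request body (endpoint URL, model name, and template are placeholders):

```python
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="default",  # placeholder
    messages=[{"role": "user", "content": "Hello"}],
    # Hypothetical extra field forwarded to the server; if omitted, the
    # server falls back to its configured chat_template.
    extra_body={"chat_template": "{% for m in messages %}{{ m['content'] }}{% endfor %}"},
)
```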

fastdeploy/input/ernie_processor.py

Lines changed: 3 additions & 1 deletion

@@ -88,6 +88,7 @@ def process_request(self, request, max_model_len=None, **kwargs):
             bool: Whether preprocessing is successful
             str: error message
         """
+        request.chat_template = kwargs.get("chat_template")
         request = self._apply_default_parameters(request)
         if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
             request.eos_token_ids = self.eos_token_ids

@@ -139,6 +140,7 @@ def process_request_dict(self, request, max_model_len=None):
             str: error message
         """
         request = self._apply_default_parameters(request)
+        request["enable_thinking"] = request.get("chat_template_kwargs", {}).get("enable_thinking")
         if not request.get("eos_token_ids"):
             request["eos_token_ids"] = self.eos_token_ids

@@ -309,7 +311,7 @@ def messages2ids(self, request_or_messages):
             tokenize=False,
             split_special_tokens=False,
             add_special_tokens=False,
-            chat_template=request_or_messages.get("chat_template", None)
+            chat_template=request_or_messages.get("chat_template", None),
         )

         req_id = None
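
Note that enable_thinking is now read from a nested chat_template_kwargs dict on the request rather than from a top-level keyword argument. A minimal sketch of the dict shape this line expects (all other request fields elided):

```python
# Sketch only: the real request dict carries many more fields.
request = {
    "messages": [{"role": "user", "content": "Hello"}],
    "chat_template_kwargs": {"enable_thinking": True},
}
enable_thinking = request.get("chat_template_kwargs", {}).get("enable_thinking")
assert enable_thinking is True
```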

fastdeploy/input/ernie_vl_processor.py

Lines changed: 2 additions & 1 deletion

@@ -108,8 +108,9 @@ def set_value(req, key, value):

     def process_request(self, request, max_model_len=None, **kwargs):
         """process the input data"""
+        request.chat_template = kwargs.get("chat_template")
         task = request.to_dict()
-        task["enable_thinking"] = kwargs.get("enable_thinking", True)
+        task["enable_thinking"] = task.get("chat_template_kwargs", {}).get("enable_thinking", True)
         self.process_request_dict(task, max_model_len)
         request = Request.from_dict(task)
         request = self._apply_default_parameters(request)

fastdeploy/input/mm_processor/process.py

Lines changed: 0 additions & 1 deletion

@@ -501,7 +501,6 @@ def apply_chat_template(self, request):
                 tokenize=False,
                 add_generation_prompt=request.get("add_generation_prompt", True),
                 chat_template=request.get("chat_template", None),
-
             )
             .replace("<|image@placeholder|>", "")
             .replace("<|video@placeholder|>", "")
