
Commit b77da03

fix
1 parent f5f2c1f commit b77da03

File tree

12 files changed: +104 -37 lines

docs/parameters.md

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ When using FastDeploy to deploy models (including offline inference and service
 | ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
 | ```enable_expert_parallel``` | `bool` | Whether to enable expert parallel |
 | ```enable_logprob``` | `bool` | Whether to return log probabilities of the output tokens. If true, the log probability of each output token is returned in the message content. If logprob is not used, this parameter can be omitted at startup |
+| ```chat_template``` | `str` | Specifies the template used to assemble the model's prompt. It accepts either a template string or a file path. The default value is None; if not specified, the model's default template is used. |

 ## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```?
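
A minimal sketch of how the new `chat_template` parameter might be supplied for offline inference, assuming the `LLM` entrypoint accepts it as a constructor keyword the same way the engine arguments expose it (the model name and template path below are placeholders):

```python
from fastdeploy.entrypoints.llm import LLM

# Assumption: chat_template maps onto the documented engine parameter and may
# be either an inline template string or a path to a template file.
# Omitting it (or passing None) keeps the model's default template.
llm = LLM(
    model="<model_name_or_path>",             # placeholder
    chat_template="./custom_template.jinja",  # placeholder template file
)
```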

docs/zh/parameters.md

Lines changed: 1 addition & 0 deletions

@@ -42,6 +42,7 @@
 | ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
 | ```enable_expert_parallel``` | `bool` | Whether to enable expert parallel |
 | ```enable_logprob``` | `bool` | Whether to return log probabilities for the output tokens. If logprob is not used, this parameter can be omitted at startup. |
+| ```chat_template``` | `str` | Specifies the template used to assemble the model's prompt; accepts either a template string or a file path. Defaults to None; if not specified, the model's default template is used. |

 ## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```?

fastdeploy/engine/engine.py

Lines changed: 1 addition & 4 deletions

@@ -497,10 +497,7 @@ def add_requests(self, task, sampling_params=None, **kwargs):
         request.sampling_params = sampling_params
         request.preprocess_start_time = time.time()

-        enable_thinking = None
-        if kwargs is not None:
-            enable_thinking = kwargs.get("enable_thinking", None)
-        request = self.data_processor.process_request(request, self.cfg.max_model_len, enable_thinking=enable_thinking)
+        request = self.data_processor.process_request(request, self.cfg.max_model_len, **kwargs)
         request.prompt_token_ids_len = len(request.prompt_token_ids)
         request.need_prefill_tokens = request.prompt_token_ids_len
         input_ids_len = request.prompt_token_ids_len
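
With this change, add_requests no longer special-cases enable_thinking; any extra keyword arguments are forwarded verbatim to the data processor. A hedged sketch of a call that relies on this pass-through (the engine instance and task object are placeholders; the keyword name is the one the input processors below read):

```python
# Sketch only: `engine` is an initialized LLMEngine and `task` a Request-like
# object; both are elided here.
engine.add_requests(
    task,
    sampling_params,
    chat_template="./custom_template.jinja",  # consumed via kwargs.get("chat_template")
)
```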

fastdeploy/entrypoints/chat_utils.py

Lines changed: 15 additions & 10 deletions

@@ -15,7 +15,8 @@
 """

 from copy import deepcopy
-from typing import List, Literal, Union, Optional
+from pathlib import Path
+from typing import List, Literal, Optional, Union
 from urllib.parse import urlparse

 import requests

@@ -29,7 +30,6 @@

 from fastdeploy.multimodal.image import ImageMediaIO
 from fastdeploy.multimodal.video import VideoMediaIO
-from pathlib import Path


 class VideoURL(TypedDict, total=False):

@@ -158,16 +158,19 @@ def parse_chat_messages(messages):
         conversation.append({"role": role, "content": parsed_content})
     return conversation

-def load_chat_template(chat_template: Union[Path, str], is_literal: bool = False,) -> Optional[str]:
+
+def load_chat_template(
+    chat_template: Union[Path, str],
+    is_literal: bool = False,
+) -> Optional[str]:
     if chat_template is None:
         return None
     if is_literal:
         if isinstance(chat_template, Path):
-            raise TypeError("chat_template is expected to be read directly "
-                            "from its value")
+            raise TypeError("chat_template is expected to be read directly " "from its value")

         return chat_template
-
+
     try:
         with open(chat_template) as f:
             return f.read()

@@ -176,11 +179,13 @@ def load_chat_template(chat_template: Union[Path, str], is_literal: bool = False
             raise
         JINJA_CHARS = "{}\n"
         if not any(c in chat_template for c in JINJA_CHARS):
-            msg = (f"The supplied chat template ({chat_template}) "
-                   f"looks like a file path, but it failed to be "
-                   f"opened. Reason: {e}")
+            msg = (
+                f"The supplied chat template ({chat_template}) "
+                f"looks like a file path, but it failed to be "
+                f"opened. Reason: {e}"
+            )
             raise ValueError(msg) from e

     # If opening a file fails, set chat template to be args to
     # ensure we decode so our escape are interpreted correctly
-    return load_chat_template(chat_template, is_literal=True)
+    return load_chat_template(chat_template, is_literal=True)
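
Based only on the logic visible in these hunks, a short sketch of how the reworked load_chat_template behaves (the template path is a placeholder):

```python
from fastdeploy.entrypoints.chat_utils import load_chat_template

# None passes through unchanged, meaning "use the model's default template".
assert load_chat_template(None) is None

# A string that cannot be opened as a file but contains Jinja characters
# ("{", "}" or newlines) is returned verbatim via the is_literal fallback.
inline = "{% for m in messages %}{{ m['content'] }}{% endfor %}"
assert load_chat_template(inline) == inline

# A readable path (placeholder here) is opened and its contents returned;
# a non-existent plain path with no Jinja characters raises ValueError instead.
template_text = load_chat_template("./custom_template.jinja")
```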

fastdeploy/entrypoints/llm.py

Lines changed: 6 additions & 8 deletions

@@ -28,14 +28,14 @@
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.engine import LLMEngine
 from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.entrypoints.chat_utils import load_chat_template
 from fastdeploy.plugins.model_register import load_model_register_plugins
 from fastdeploy.utils import (
     deprecated_kwargs_warning,
     llm_logger,
     retrive_model_from_server,
 )
 from fastdeploy.worker.output import Logprob, LogprobsLists
-from fastdeploy.entrypoints.chat_utils import load_chat_template

 root_logger = logging.getLogger()
 for handler in root_logger.handlers[:]:

@@ -228,17 +228,18 @@ def chat(

         if sampling_params_len != 1 and len(messages) != sampling_params_len:
             raise ValueError("messages and sampling_params must be the same length.")
-
+
         if chat_template is None:
             chat_template = self.chat_template

         messages_len = len(messages)
         for i in range(messages_len):
-            messages[i] = {"messages": messages[i], "chat_template": chat_template}
+            messages[i] = {"messages": messages[i]}
         req_ids = self._add_request(
             prompts=messages,
             sampling_params=sampling_params,
             chat_template_kwargs=chat_template_kwargs,
+            chat_template=chat_template,
         )

         topk_logprobs = sampling_params[0].logprobs if sampling_params_len > 1 else sampling_params.logprobs

@@ -251,7 +252,7 @@ def _add_request(
         self,
         prompts,
         sampling_params,
-        chat_template_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
     ):
         """
         Add a request to the LLM Engine and return the ID of that request.

@@ -292,10 +293,7 @@ def _add_request(
                 current_sampling_params = sampling_params[i]
             else:
                 current_sampling_params = sampling_params
-            enable_thinking = None
-            if chat_template_kwargs is not None:
-                enable_thinking = chat_template_kwargs.get("enable_thinking", None)
-            self.llm_engine.add_requests(tasks, current_sampling_params, enable_thinking=enable_thinking)
+            self.llm_engine.add_requests(tasks, current_sampling_params, **kwargs)
         return req_ids

     def _decode_token(self, token_id: int) -> str:
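
A hedged sketch of the per-call override this wiring enables, assuming chat() keeps a public signature with messages, sampling_params, chat_template and chat_template_kwargs as the hunks above suggest (the model name is a placeholder):

```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

llm = LLM(model="<model_name_or_path>")  # placeholder

outputs = llm.chat(
    messages=[[{"role": "user", "content": "What does block_size control?"}]],
    sampling_params=SamplingParams(temperature=0.8),
    # Per-call template; left as None, the instance-level self.chat_template is used.
    chat_template="{% for m in messages %}{{ m['content'] }}{% endfor %}",
    chat_template_kwargs={"enable_thinking": False},
)
```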

fastdeploy/entrypoints/openai/api_server.py

Lines changed: 2 additions & 2 deletions

@@ -28,6 +28,7 @@

 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.engine import LLMEngine
+from fastdeploy.entrypoints.chat_utils import load_chat_template
 from fastdeploy.entrypoints.engine_client import EngineClient
 from fastdeploy.entrypoints.openai.protocol import (
     ChatCompletionRequest,

@@ -54,7 +55,6 @@
     is_port_available,
     retrive_model_from_server,
 )
-from fastdeploy.entrypoints.chat_utils import load_chat_template

 parser = FlexibleArgumentParser()
 parser.add_argument("--port", default=8000, type=int, help="port to the http server")

@@ -65,6 +65,7 @@
 parser = EngineArgs.add_cli_args(parser)
 args = parser.parse_args()
 args.model = retrive_model_from_server(args.model, args.revision)
+chat_template = load_chat_template(args.chat_template)

 llm_engine = None

@@ -105,7 +106,6 @@ async def lifespan(app: FastAPI):
         pid = os.getppid()
     else:
         pid = os.getpid()
-    chat_template = load_chat_template(args.chat_template)
     api_server_logger.info(f"{pid}")
     engine_client = EngineClient(
         args.model,

fastdeploy/entrypoints/openai/serving_chat.py

Lines changed: 2 additions & 2 deletions

@@ -85,9 +85,9 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
         api_server_logger.info(f"create chat completion request: {request_id}")

         try:
-            if request.chat_template is None:
-                request.chat_template = self.chat_template
             current_req_dict = request.to_dict_for_infer(request_id)
+            if "chat_template" not in current_req_dict:
+                current_req_dict["chat_template"] = self.chat_template
             current_req_dict["arrival_time"] = time.time()
             prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
             if isinstance(prompt_token_ids, np.ndarray):
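
Since ChatCompletionRequest already carries a chat_template field (see the removed lines), the server-side default is now applied only when the client omits it. A hedged client-side sketch, assuming the OpenAI-compatible endpoint accepts the extra field through the request body (endpoint URL, model name, and template are placeholders):

```python
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="default",  # placeholder
    messages=[{"role": "user", "content": "Hello"}],
    # Hypothetical extra field forwarded to the server; if omitted, the
    # server falls back to its configured chat_template.
    extra_body={"chat_template": "{% for m in messages %}{{ m['content'] }}{% endfor %}"},
)
```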

fastdeploy/input/ernie_processor.py

Lines changed: 3 additions & 1 deletion

@@ -88,6 +88,7 @@ def process_request(self, request, max_model_len=None, **kwargs):
             bool: Whether preprocessing is successful
             str: error message
         """
+        request.chat_template = kwargs.get("chat_template")
         request = self._apply_default_parameters(request)
         if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
             request.eos_token_ids = self.eos_token_ids

@@ -139,6 +140,7 @@ def process_request_dict(self, request, max_model_len=None):
             str: error message
         """
         request = self._apply_default_parameters(request)
+        request["enable_thinking"] = request.get("chat_template_kwargs", {}).get("enable_thinking")
         if not request.get("eos_token_ids"):
             request["eos_token_ids"] = self.eos_token_ids

@@ -309,7 +311,7 @@ def messages2ids(self, request_or_messages):
             tokenize=False,
             split_special_tokens=False,
             add_special_tokens=False,
-            chat_template=request_or_messages.get("chat_template", None)
+            chat_template=request_or_messages.get("chat_template", None),
         )

         req_id = None
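
Note that enable_thinking is now read from a nested chat_template_kwargs dict on the request rather than from a top-level keyword argument. A minimal sketch of the dict shape this line expects (all other request fields elided):

```python
# Sketch only: the real request dict carries many more fields.
request = {
    "messages": [{"role": "user", "content": "Hello"}],
    "chat_template_kwargs": {"enable_thinking": True},
}
enable_thinking = request.get("chat_template_kwargs", {}).get("enable_thinking")
assert enable_thinking is True
```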

fastdeploy/input/ernie_vl_processor.py

Lines changed: 2 additions & 1 deletion

@@ -108,8 +108,9 @@ def set_value(req, key, value):

     def process_request(self, request, max_model_len=None, **kwargs):
         """process the input data"""
+        request.chat_template = kwargs.get("chat_template")
         task = request.to_dict()
-        task["enable_thinking"] = kwargs.get("enable_thinking", True)
+        task["enable_thinking"] = task.get("chat_template_kwargs", {}).get("enable_thinking", True)
         self.process_request_dict(task, max_model_len)
         request = Request.from_dict(task)
         request = self._apply_default_parameters(request)

fastdeploy/input/mm_processor/process.py

Lines changed: 0 additions & 1 deletion

@@ -501,7 +501,6 @@ def apply_chat_template(self, request):
                 tokenize=False,
                 add_generation_prompt=request.get("add_generation_prompt", True),
                 chat_template=request.get("chat_template", None),
-
             )
             .replace("<|image@placeholder|>", "")
             .replace("<|video@placeholder|>", "")
