
Commit 2b6fe1b

add offline

1 parent 39ac40f commit 2b6fe1b

6 files changed, +86 -42 lines changed

fastdeploy/engine/request.py (20 additions, 0 deletions)

```diff
@@ -41,6 +41,25 @@ class RequestType(Enum):
     PREEMPTED = 2


+class ToolCall:
+    """
+    Tool call.
+    """
+
+    id: str = None
+    type: Literal["function"] = "function"
+    function: FunctionCall
+
+
+class DeltaFunctionCall:
+    """
+    Delta function call.
+    """
+
+    name: Optional[str] = None
+    arguments: Optional[str] = None
+
+
 @dataclass
 class Request:
     def __init__(
@@ -249,6 +268,7 @@ class CompletionOutput:
     draft_token_ids: list[int] = None
     text: Optional[str] = None
    reasoning_content: Optional[str] = None
+    tool_calls: Optional[ToolCall] = None

     def to_dict(self):
         """
```

fastdeploy/entrypoints/llm.py (4 additions, 0 deletions)

```diff
@@ -28,6 +28,7 @@
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.engine import LLMEngine
 from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
 from fastdeploy.plugins.model_register import load_model_register_plugins
 from fastdeploy.utils import (
     deprecated_kwargs_warning,
@@ -79,6 +80,9 @@ def __init__(

         load_model_register_plugins()
         model = retrive_model_from_server(model, revision)
+        tool_parser_plugin = kwargs.get("tool_parser_plugin")
+        if tool_parser_plugin:
+            ToolParserManager.import_tool_parser(tool_parser_plugin)
         engine_args = EngineArgs(
             model=model,
             tokenizer=tokenizer,
```
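
`ToolParserManager`'s internals are not shown in this diff. A minimal sketch of the registry pattern its `import_tool_parser` call implies, with module-loading details that are assumptions rather than the real implementation:

```python
# Sketch of a plugin registry along the lines of ToolParserManager; the real
# class is not part of this diff, so these loading details are assumptions.
import importlib.util


class ToolParserManagerSketch:
    tool_parsers: dict[str, type] = {}

    @classmethod
    def register_module(cls, name: str):
        def decorator(parser_cls: type) -> type:
            cls.tool_parsers[name] = parser_cls
            return parser_cls
        return decorator

    @classmethod
    def import_tool_parser(cls, plugin_path: str) -> None:
        # Execute the user's plugin module so its register_module
        # decorators run and populate the registry.
        spec = importlib.util.spec_from_file_location("tool_parser_plugin", plugin_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
```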

fastdeploy/entrypoints/openai/serving_chat.py (12 additions, 13 deletions)

```diff
@@ -125,6 +125,7 @@ async def chat_completion_stream_generator(
         previous_num_tokens = 0
         num_prompt_tokens = 0
         num_choices = 1
+        tool_called = False
         max_streaming_response_tokens = (
             request.max_streaming_response_tokens
             if request.max_streaming_response_tokens is not None
@@ -227,6 +228,7 @@ async def chat_completion_stream_generator(
                 output = res["outputs"]
                 delta_text = output["text"]
                 output_top_logprobs = output["top_logprobs"]
+                previous_num_tokens += len(output["token_ids"])
                 logprobs_res: Optional[LogProbs] = None
                 if request.logprobs and output_top_logprobs is not None:
                     logprobs_res = self._create_chat_logprobs(
@@ -236,17 +238,17 @@ async def chat_completion_stream_generator(
                     tool_delta_message = output["tool_delta_message"]
                     if tool_delta_message is None:
                         continue
-                    else:
-                        delta_message = tool_delta_message
+                    delta_message = tool_delta_message
+                    delta_message.reasoning_content = output.get("reasoning_content")
+                    tool_called = True
                 else:
                     delta_message = DeltaMessage(
-                        content=delta_text,
-                        reasoning_content=output.get("reasoning_content"),
-                        prompt_token_ids=None,
-                        completion_token_ids=None,
-                        tool_calls=None,
-                    )
-                previous_num_tokens += len(output["token_ids"])
+                        content=delta_text,
+                        reasoning_content=output.get("reasoning_content"),
+                        prompt_token_ids=None,
+                        completion_token_ids=None,
+                        tool_calls=None,
+                    )

                 choice = ChatCompletionResponseStreamChoice(
                     index=0,
@@ -263,10 +265,7 @@ async def chat_completion_stream_generator(
                     max_tokens = request.max_completion_tokens or request.max_tokens
                     if has_no_token_limit or previous_num_tokens != max_tokens:
                         choice.finish_reason = "stop"
-                        if (
-                            self.engine_client.reasoning_parser == "ernie_x1"
-                            and output.get("finish_reason", "") == "tool_calls"
-                        ):
+                        if tool_called:
                             choice.finish_reason = "tool_calls"
                     else:
                         choice.finish_reason = "length"
```

fastdeploy/entrypoints/openai/serving_completion.py (23 additions, 12 deletions)

```diff
@@ -245,6 +245,7 @@ async def completion_stream_generator(
         output_tokens = [0] * num_choices
         inference_start_time = [0] * num_choices
         first_iteration = [True] * num_choices
+        tool_called = False
         max_streaming_response_tokens = (
             request.max_streaming_response_tokens
             if request.max_streaming_response_tokens is not None
@@ -311,32 +312,42 @@ async def completion_stream_generator(
                 logprobs_res: Optional[CompletionLogprobs] = None
                 if request.logprobs and output_top_logprobs is not None:
                     logprobs_res = self._create_completion_logprobs(output_top_logprobs, request.logprobs, 0)
-
-                choices.append(
-                    CompletionResponseStreamChoice(
+                output_tokens[idx] += 1
+                if self.engine_client.data_processor.tool_parser and not res["finished"]:
+                    tool_delta_message = output["tool_delta_message"]
+                    if tool_delta_message is None:
+                        continue
+                    delta_message = CompletionResponseStreamChoice(
+                        index=idx,
+                        text=output["text"],
+                        completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
+                        tool_calls=tool_delta_message.tool_calls,
+                        reasoning_content=output.get("reasoning_content"),
+                        arrival_time=arrival_time,
+                        logprobs=logprobs_res,
+                    )
+                    tool_called = True
+                else:
+                    delta_message = CompletionResponseStreamChoice(
                         index=idx,
                         text=output["text"],
                         prompt_token_ids=None,
                         completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
-                        tool_calls=output.get("tool_call_content"),
+                        tool_calls=None,
                         reasoning_content=output.get("reasoning_content"),
                         arrival_time=arrival_time,
                         logprobs=logprobs_res,
                     )
-                )
+
+                choices.append(delta_message)
                 if res["finished"]:
                     if request.max_tokens is None or output_tokens[idx] + 1 != request.max_tokens:
                         chunk.choices[0].finish_reason = "stop"
-                        if (
-                            self.engine_client.reasoning_parser == "ernie_x1"
-                            and output.get("finish_reason", "") == "tool_calls"
-                        ):
+                        if tool_called:
                             chunk.choices[0].finish_reason = "tool_calls"
                     else:
                         chunk.choices[0].finish_reason = "length"

-                output_tokens[idx] += 1
-
                 if len(choices) == max_streaming_response_tokens or res["finished"]:
                     chunk = CompletionStreamResponse(
                         id=request_id,
@@ -428,7 +439,7 @@ def request_output_to_completion_response(
                 prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
                 completion_token_ids=completion_token_ids if request.return_token_ids else None,
                 reasoning_content=output.get("reasoning_content"),
-                tool_calls=output.get("tool_call_content"),
+                tool_calls=output.get("tool_call"),
                 logprobs=aggregated_logprobs,
                 finish_reason=None,
             )
```
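
This mirrors the chat endpoint, with one extra wrinkle: while a tool parser is active, a chunk is withheld (`continue`) until the parser yields a delta. A condensed, hypothetical form of the per-chunk decision, with names simplified from the handler:

```python
# Hypothetical condensed form of the branch above: withhold chunks until the
# tool parser yields a delta, and remember that a tool call was streamed.
def build_choice(output: dict, parser_active: bool, finished: bool,
                 state: dict) -> dict | None:
    if parser_active and not finished:
        tool_delta = output.get("tool_delta_message")
        if tool_delta is None:
            return None                  # nothing parseable yet; skip this chunk
        state["tool_called"] = True
        return {"text": output["text"], "tool_calls": tool_delta.tool_calls}
    return {"text": output["text"], "tool_calls": None}
```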

fastdeploy/input/ernie_processor.py (14 additions, 9 deletions)

```diff
@@ -62,9 +62,10 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob
         self.eos_token_id_len = len(self.eos_token_ids)
         self.pad_token_id = self.get_pad_id()
         self.reasoning_parser = None
-        self.tool_parser = tool_parser_obj
+        self.tool_parser_obj = tool_parser_obj
         if reasoning_parser_obj:
             self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
+
     def _init_config(self):
         self.use_hf_tokenizer = int(envs.FD_USE_HF_TOKENIZER) == 1

@@ -205,6 +206,12 @@ def process_response(self, response_dict, **kwargs):
             response_dict.outputs.reasoning_content = reasoning_content
         else:
             response_dict.outputs.text = full_text
+        if self.tool_parser_obj:
+            tool_parser = self.tool_parser_obj(self.tokenizer)
+            tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
+            if tool_call_info.tools_called:
+                response_dict.outputs.tool_calls = tool_call_info.tool_calls
+                response_dict.outputs.text = tool_call_info.content
         data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
         if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
             return None
@@ -251,10 +258,9 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             response_dict["outputs"]["reasoning_content"] = reasoning_content
         else:
             response_dict["outputs"]["text"] = full_text
-        if self.tool_parser:
-            tool_parser = self.tool_parser(self.tokenizer)
-            tool_call_info = tool_parser.extract_tool_calls(
-                full_text, response_dict)
+        if self.tool_parser_obj:
+            tool_parser = self.tool_parser_obj(self.tokenizer)
+            tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
             if tool_call_info.tools_called:
                 response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
                 response_dict["outputs"]["text"] = tool_call_info.content
@@ -276,7 +282,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
         token_ids = response_dict["outputs"]["token_ids"]
-

         if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"):
             if token_ids[-1] == self.tokenizer.eos_token_id:
@@ -295,9 +300,9 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             response_dict["outputs"]["reasoning_content"] = reasoning_content
         else:
             response_dict["outputs"]["text"] = delta_text
-        if self.tool_parser:
+        if self.tool_parser_obj:
             if req_id not in self.tool_parsers:
-                self.tool_parsers[req_id] = self.tool_parser(self.tokenizer)
+                self.tool_parsers[req_id] = self.tool_parser_obj(self.tokenizer)
             tool_parser = self.tool_parsers[req_id]
             tool_call = tool_parser.extract_tool_calls_streaming(
                 previous_texts,
@@ -306,7 +311,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
                 previous_token_ids,
                 previous_token_ids + token_ids,
                 token_ids,
-                response_dict
+                response_dict,
             )
             response_dict["outputs"]["tool_delta_message"] = tool_call
             if is_end:
```
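
The rename from `tool_parser` to `tool_parser_obj` reflects that the attribute holds a parser class, not an instance: streaming parsing is stateful, so each request id gets its own instance on first use. A sketch of that caching pattern, with a hypothetical parser class standing in for the real one:

```python
# Sketch of the per-request parser cache used above. tool_parser_obj is a
# class; each streaming request gets its own stateful instance, keyed by
# req_id. StreamingToolParser is a hypothetical stand-in.
class StreamingToolParser:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.buffer = ""              # partial tool-call text seen so far


class ProcessorSketch:
    def __init__(self, tool_parser_obj, tokenizer):
        self.tool_parser_obj = tool_parser_obj    # a class, not an instance
        self.tokenizer = tokenizer
        self.tool_parsers = {}

    def parser_for(self, req_id: str) -> StreamingToolParser:
        if req_id not in self.tool_parsers:
            self.tool_parsers[req_id] = self.tool_parser_obj(self.tokenizer)
        return self.tool_parsers[req_id]

    def finish(self, req_id: str) -> None:
        self.tool_parsers.pop(req_id, None)       # drop state when the stream ends
```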

fastdeploy/input/text_processor.py (13 additions, 8 deletions)

```diff
@@ -181,7 +181,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob
         self.eos_token_id_len = len(self.eos_token_ids)
         self.pad_token_id = self.get_pad_id()
         self.reasoning_parser = None
-        self.tool_parser = tool_parser_obj
+        self.tool_parser_obj = tool_parser_obj
         if reasoning_parser_obj:
             self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
         self.tokenizer.pad_token_id = self.pad_token_id
@@ -330,6 +330,12 @@ def process_response(self, response_dict, **kwargs):
         else:
             # The model does not support thinking, and enable_thinking was not separately set to false
             response_dict.outputs.text = full_text
+        if self.tool_parser_obj:
+            tool_parser = self.tool_parser_obj(self.tokenizer)
+            tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
+            if tool_call_info.tools_called:
+                response_dict.outputs.tool_calls = tool_call_info.tool_calls
+                response_dict.outputs.text = tool_call_info.content
         data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")

         return response_dict
@@ -360,10 +366,9 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             response_dict["outputs"]["reasoning_content"] = reasoning_content
         else:
             response_dict["outputs"]["text"] = full_text
-        if self.tool_parser:
-            tool_parser = self.tool_parser(self.tokenizer)
-            tool_call_info = tool_parser.extract_tool_calls(
-                full_text, response_dict)
+        if self.tool_parser_obj:
+            tool_parser = self.tool_parser_obj(self.tokenizer)
+            tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
             if tool_call_info.tools_called:
                 response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
                 response_dict["outputs"]["text"] = tool_call_info.content
@@ -404,9 +409,9 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             response_dict["outputs"]["reasoning_content"] = reasoning_content
         else:
             response_dict["outputs"]["text"] = delta_text
-        if self.tool_parser and not is_end:
+        if self.tool_parser_obj and not is_end:
             if req_id not in self.tool_parsers:
-                self.tool_parsers[req_id] = self.tool_parser(self.tokenizer)
+                self.tool_parsers[req_id] = self.tool_parser_obj(self.tokenizer)
             tool_parser = self.tool_parsers[req_id]
             tool_call = tool_parser.extract_tool_calls_streaming(
                 previous_texts,
@@ -415,7 +420,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
                 previous_token_ids,
                 previous_token_ids + token_ids,
                 token_ids,
-                response_dict
+                response_dict,
             )
             response_dict["outputs"]["tool_delta_message"] = tool_call
             if is_end:
```
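
Both processors rely on the same return contract from `extract_tool_calls`: a result carrying `tools_called`, `tool_calls`, and `content`. That shape is inferred from the call sites above; a hypothetical sketch of the non-streaming path:

```python
# Assumed result shape for extract_tool_calls, inferred from the call sites
# above; this dataclass is a hypothetical stand-in, not FastDeploy's class.
from dataclasses import dataclass, field


@dataclass
class ToolCallInfo:
    tools_called: bool = False
    tool_calls: list = field(default_factory=list)
    content: str = ""


def apply_tool_parse(outputs: dict, info: ToolCallInfo) -> None:
    # Mirrors the non-streaming call sites: move parsed calls onto the
    # output dict and keep only the non-tool text.
    if info.tools_called:
        outputs["tool_call"] = info.tool_calls
        outputs["text"] = info.content


outputs = {"text": '<tool>{"name": "get_weather"}</tool>'}
apply_tool_parse(outputs, ToolCallInfo(True, [{"name": "get_weather"}], ""))
print(outputs)  # {'text': '', 'tool_call': [{'name': 'get_weather'}]}
```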
