Completion add raw_prediction/text_after_process #3362

Merged: 1 commit, Aug 12, 2025
8 changes: 8 additions & 0 deletions fastdeploy/entrypoints/openai/protocol.py
@@ -126,6 +126,8 @@ class ChatMessage(BaseModel):
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
text_after_process: Optional[str] = None
raw_prediction: Optional[str] = None


class ChatCompletionResponseChoice(BaseModel):
@@ -183,6 +185,8 @@ class DeltaMessage(BaseModel):
completion_token_ids: Optional[List[int]] = None
reasoning_content: Optional[str] = None
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
text_after_process: Optional[str] = None
raw_prediction: Optional[str] = None


class ChatCompletionResponseStreamChoice(BaseModel):
@@ -219,6 +223,8 @@ class CompletionResponseChoice(BaseModel):
text: str
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
text_after_process: Optional[str] = None
raw_prediction: Optional[str] = None
arrival_time: Optional[float] = None
logprobs: Optional[CompletionLogprobs] = None
reasoning_content: Optional[str] = None
@@ -261,6 +267,8 @@ class CompletionResponseStreamChoice(BaseModel):
logprobs: Optional[CompletionLogprobs] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
text_after_process: Optional[str] = None
raw_prediction: Optional[str] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
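
The two new fields follow the existing optional-field pattern in these models: they default to None, carry the chat-template-rendered prompt text and the unparsed model output, and stay empty unless the handlers fill them in. A minimal pydantic sketch (a simplified stand-in for the models above, not their full definitions):

from typing import Optional
from pydantic import BaseModel

class ChoiceSketch(BaseModel):
    # Simplified stand-in for the choice/message models touched in this file.
    text: str = ""
    text_after_process: Optional[str] = None  # prompt text after chat-template rendering
    raw_prediction: Optional[str] = None      # decoded output before reasoning/tool-call parsing

choice = ChoiceSketch(text="4", text_after_process="User: 2+2=\nAssistant:", raw_prediction="4")
print(choice)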
17 changes: 14 additions & 3 deletions fastdeploy/entrypoints/openai/serving_chat.py
@@ -83,11 +83,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
else:
request_id = f"chatcmpl-{uuid.uuid4()}"
api_server_logger.info(f"create chat completion request: {request_id}")

text_after_process = None
try:
current_req_dict = request.to_dict_for_infer(request_id)
current_req_dict["arrival_time"] = time.time()
prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
text_after_process = current_req_dict.get("text_after_process")
if isinstance(prompt_token_ids, np.ndarray):
prompt_token_ids = prompt_token_ids.tolist()
except Exception as e:
@@ -105,10 +106,14 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")

if request.stream:
return self.chat_completion_stream_generator(request, request_id, request.model, prompt_token_ids)
return self.chat_completion_stream_generator(
request, request_id, request.model, prompt_token_ids, text_after_process
)
else:
try:
return await self.chat_completion_full_generator(request, request_id, request.model, prompt_token_ids)
return await self.chat_completion_full_generator(
request, request_id, request.model, prompt_token_ids, text_after_process
)
except Exception as e:
return ErrorResponse(code=400, message=str(e))

@@ -125,6 +130,7 @@ async def chat_completion_stream_generator(
request_id: str,
model_name: str,
prompt_token_ids: list(),
text_after_process: str,
):
"""
Streaming chat completion generator.
@@ -217,6 +223,7 @@ async def chat_completion_stream_generator(
)
if request.return_token_ids:
choice.delta.prompt_token_ids = list(prompt_token_ids)
choice.delta.text_after_process = text_after_process
chunk = ChatCompletionStreamResponse(
id=request_id,
object=chunk_object_type,
@@ -280,6 +287,7 @@ async def chat_completion_stream_generator(

if request.return_token_ids:
choice.delta.completion_token_ids = list(output["token_ids"])
choice.delta.raw_prediction = output.get("raw_prediction")
if include_continuous_usage:
chunk.usage = UsageInfo(
prompt_tokens=num_prompt_tokens,
@@ -330,6 +338,7 @@ async def chat_completion_full_generator(
request_id: str,
model_name: str,
prompt_token_ids: list(),
text_after_process: str,
):
"""
Full chat completion generator.
@@ -408,6 +417,8 @@ async def chat_completion_full_generator(
tool_calls=output.get("tool_call_content"),
prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
completion_token_ids=completion_token_ids if request.return_token_ids else None,
text_after_process=text_after_process if request.return_token_ids else None,
raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
)
logprobs_full_res = None
if logprob_contents:
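
As the handler changes show, both fields are populated only when the request sets return_token_ids: text_after_process is read back from the request dict right after format_and_add_data, and raw_prediction is taken from each output. A minimal client-side sketch, assuming an OpenAI-compatible FastDeploy server on a local port (URL, port, and model name are placeholders for your deployment):

import requests

resp = requests.post(
    "http://localhost:8188/v1/chat/completions",  # placeholder endpoint
    json={
        "model": "default",
        "messages": [{"role": "user", "content": "2+2="}],
        "return_token_ids": True,  # opt-in flag that gates the new fields
    },
).json()
message = resp["choices"][0]["message"]
print(message.get("text_after_process"))  # prompt after chat-template rendering
print(message.get("raw_prediction"))      # untouched model output for this choice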
12 changes: 12 additions & 0 deletions fastdeploy/entrypoints/openai/serving_completion.py
@@ -100,6 +100,7 @@ async def create_completion(self, request: CompletionRequest):

api_server_logger.info(f"start inference for request {num_choices}")
prompt_batched_token_ids = []
text_after_process_list = []
try:
for idx, prompt in enumerate(request_prompts):
request_id_idx = f"{request_id}-{idx}"
@@ -109,6 +110,7 @@ async def create_completion(self, request: CompletionRequest):
prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
if isinstance(prompt_token_ids, np.ndarray):
prompt_token_ids = prompt_token_ids.tolist()
text_after_process_list.append(current_req_dict.get("text_after_process"))
prompt_batched_token_ids.append(prompt_token_ids)
except Exception as e:
return ErrorResponse(message=str(e), code=400)
@@ -131,6 +133,7 @@ async def create_completion(self, request: CompletionRequest):
created_time=created_time,
model_name=request.model,
prompt_batched_token_ids=prompt_batched_token_ids,
text_after_process_list=text_after_process_list,
)
else:
try:
@@ -141,6 +144,7 @@ async def create_completion(self, request: CompletionRequest):
created_time=created_time,
model_name=request.model,
prompt_batched_token_ids=prompt_batched_token_ids,
text_after_process_list=text_after_process_list,
)
except Exception as e:
return ErrorResponse(code=400, message=str(e))
@@ -156,6 +160,7 @@ async def completion_full_generator(
created_time: int,
model_name: str,
prompt_batched_token_ids: list(),
text_after_process_list: list(),
):
"""
Process the full completion request with multiple choices.
@@ -225,6 +230,7 @@ async def completion_full_generator(
model_name=model_name,
prompt_batched_token_ids=prompt_batched_token_ids,
completion_batched_token_ids=completion_batched_token_ids,
text_after_process_list=text_after_process_list,
)
except Exception as e:
api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
@@ -251,6 +257,7 @@ async def completion_stream_generator(
created_time: int,
model_name: str,
prompt_batched_token_ids: list(),
text_after_process_list: list(),
):
"""
Process the stream completion request.
@@ -309,6 +316,7 @@ async def completion_stream_generator(
index=idx,
text="",
prompt_token_ids=list(prompt_batched_token_ids[idx]),
text_after_process=text_after_process_list[idx],
completion_token_ids=None,
)
],
@@ -337,6 +345,7 @@ async def completion_stream_generator(
text=output["text"],
prompt_token_ids=None,
completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
tool_calls=output.get("tool_call_content"),
reasoning_content=output.get("reasoning_content"),
arrival_time=arrival_time,
@@ -397,6 +406,7 @@ def request_output_to_completion_response(
model_name: str,
prompt_batched_token_ids: list(),
completion_batched_token_ids: list(),
text_after_process_list: list(),
) -> CompletionResponse:
choices: List[CompletionResponseChoice] = []
num_prompt_tokens = 0
@@ -443,6 +453,8 @@ def request_output_to_completion_response(
text=output_text,
prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
completion_token_ids=completion_token_ids if request.return_token_ids else None,
raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
text_after_process=text_after_process_list[idx] if request.return_token_ids else None,
reasoning_content=output.get("reasoning_content"),
tool_calls=output.get("tool_call_content"),
logprobs=aggregated_logprobs,
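
The completions path accepts a batch of prompts, so the rendered prompt text is collected into text_after_process_list in request order and matched back to choices by index in request_output_to_completion_response. A self-contained sketch of that bookkeeping, with a stand-in for engine_client.format_and_add_data:

def fake_format_and_add_data(req):
    # Stand-in for engine_client.format_and_add_data: the real call tokenizes the
    # prompt and records the rendered text on the request dict; values here are made up.
    req["text_after_process"] = req["prompt"]
    return [101, 102]

prompts = ["Hello", "2+2="]
prompt_batched_token_ids = []
text_after_process_list = []
for prompt in prompts:
    current_req_dict = {"prompt": prompt}
    token_ids = fake_format_and_add_data(current_req_dict)
    prompt_batched_token_ids.append(token_ids)
    text_after_process_list.append(current_req_dict.get("text_after_process"))

assert len(text_after_process_list) == len(prompts)
print(text_after_process_list)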
6 changes: 4 additions & 2 deletions fastdeploy/input/ernie_processor.py
@@ -156,7 +156,7 @@ def process_request_dict(self, request, max_model_len=None):
if request.get("prompt"):
prompt = request.get("prompt")
prompt = prompt[0] if isinstance(prompt, list) else prompt

request["text_after_process"] = prompt
tokens = self.tokenizer.tokenize(prompt)
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
request["prompt_token_ids"] = token_ids
@@ -250,6 +250,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
response_dict["outputs"]["reasoning_content"] = reasoning_content
else:
response_dict["outputs"]["text"] = full_text
response_dict["outputs"]["raw_prediction"] = full_text
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
del self.decode_status[req_id]
return response_dict
@@ -286,6 +287,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
response_dict["outputs"]["reasoning_content"] = reasoning_content
else:
response_dict["outputs"]["text"] = delta_text
response_dict["outputs"]["raw_prediction"] = delta_text
if is_end:
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
del self.decode_status[req_id]
@@ -310,7 +312,7 @@ def messages2ids(self, request_or_messages):
split_special_tokens=False,
add_special_tokens=False,
)

request_or_messages["text_after_process"] = spliced_message
req_id = None
if isinstance(request_or_messages, dict):
req_id = request_or_messages.get("request_id", None)
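
In both ERNIE processor paths, raw_prediction records the decoded text before any reasoning or tool-call parsing runs: the full decoded text in the non-streaming case, the per-chunk delta while streaming. A small illustration of that relationship (the reasoning-tag format below is hypothetical, chosen only for the example):

full_text = "<think>carry the one</think>4"        # hypothetical decode with a reasoning block
outputs = {}
outputs["raw_prediction"] = full_text               # kept untouched
outputs["text"] = full_text.split("</think>")[-1]   # what a reasoning parser might leave behind
print(outputs)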
1 change: 1 addition & 0 deletions fastdeploy/input/ernie_vl_processor.py
@@ -211,6 +211,7 @@ def process_request_dict(self, request, max_model_len=None):
self._check_mm_limits(multimodal_data)
images = multimodal_data.get("image", None)
videos = multimodal_data.get("video", None)
request["text_after_process"] = request.get("prompt")
outputs = self.ernie_processor.text2ids(request["prompt"], images, videos)
elif request.get("messages"):
messages = request["messages"]
17 changes: 8 additions & 9 deletions fastdeploy/input/mm_processor/process.py
@@ -494,16 +494,15 @@ def apply_chat_template(self, request):
"""
if self.tokenizer.chat_template is None:
raise ValueError("This model does not support chat_template.")

prompt_token_str = (
self.tokenizer.apply_chat_template(
request,
tokenize=False,
add_generation_prompt=request.get("add_generation_prompt", True),
)
.replace("<|image@placeholder|>", "")
.replace("<|video@placeholder|>", "")
prompt_token_template = self.tokenizer.apply_chat_template(
request,
tokenize=False,
add_generation_prompt=request.get("add_generation_prompt", True),
)
prompt_token_str = prompt_token_template.replace("<|image@placeholder|>", "").replace(
"<|video@placeholder|>", ""
)
request["text_after_process"] = prompt_token_template
tokens = self.tokenizer.tokenize(prompt_token_str)
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
data_processor_logger.info(
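
The apply_chat_template refactor keeps the fully rendered template string, placeholders included, as text_after_process, while the string handed to the tokenizer still has the image and video placeholders stripped. A short sketch with an illustrative rendered string (the placeholder markers match the diff):

rendered = "User: describe <|image@placeholder|> briefly\nAssistant:"  # illustrative template output
text_after_process = rendered  # stored verbatim on the request
tokenizer_input = rendered.replace("<|image@placeholder|>", "").replace("<|video@placeholder|>", "")
print(text_after_process)
print(tokenizer_input)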
5 changes: 4 additions & 1 deletion fastdeploy/input/text_processor.py
@@ -281,6 +281,7 @@ def process_request_dict(self, request, max_model_len=None, **kwargs):
# processing prompt_token_ids
if not request.get("prompt_token_ids"):
if "prompt" in request:
request["text_after_process"] = request["prompt"]
request["prompt_token_ids"] = self.text2ids(request["prompt"], max_model_len).tolist()
elif "messages" in request:
if self.tokenizer.chat_template is None:
@@ -352,6 +353,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
if is_end:
full_text = previous_texts + delta_text
response_dict["outputs"]["raw_prediction"] = full_text
if enable_thinking and self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
response_dict["outputs"]["text"] = text
@@ -381,7 +383,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
if token_ids[-1] == self.tokenizer.eos_token_id:
token_ids = token_ids[:-1]
delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)

response_dict["outputs"]["raw_prediction"] = delta_text
if enable_thinking and self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming(
previous_texts,
@@ -472,6 +474,7 @@ def messages2ids(self, request):
add_special_tokens=False,
return_tensors="pd",
)
request["text_after_process"] = spliced_message
req_id = None
tokens = self.tokenizer.tokenize(spliced_message)
if isinstance(request, dict):