@@ -78,47 +78,47 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
78
78
err_msg = f"Only master node can accept completion request, please send request to master node: { self .pod_ips [0 ]} "
79
79
api_server_logger .error (err_msg )
80
80
return ErrorResponse (message = err_msg , code = 400 )
81
-
82
- if request .user is not None :
83
- request_id = f"chatcmpl-{ request .user } -{ uuid .uuid4 ()} "
84
- else :
85
- request_id = f"chatcmpl-{ uuid .uuid4 ()} "
86
- api_server_logger .info (f"create chat completion request: { request_id } " )
87
- text_after_process = None
88
81
try :
89
- current_req_dict = request .to_dict_for_infer (request_id )
90
- if "chat_template" not in current_req_dict :
91
- current_req_dict ["chat_template" ] = self .chat_template
92
- current_req_dict ["arrival_time" ] = time .time ()
93
- prompt_token_ids = self .engine_client .format_and_add_data (current_req_dict )
94
- text_after_process = current_req_dict .get ("text_after_process" )
95
- if isinstance (prompt_token_ids , np .ndarray ):
96
- prompt_token_ids = prompt_token_ids .tolist ()
97
- except Exception as e :
98
- return ErrorResponse (code = 400 , message = str (e ))
99
-
100
- del current_req_dict
101
- try :
102
- api_server_logger .debug (f"{ self .engine_client .semaphore .status ()} " )
103
82
if self .max_waiting_time < 0 :
104
83
await self .engine_client .semaphore .acquire ()
105
84
else :
106
85
await asyncio .wait_for (self .engine_client .semaphore .acquire (), timeout = self .max_waiting_time )
107
- except Exception :
108
- return ErrorResponse (code = 408 , message = f"Request queued time exceed { self .max_waiting_time } " )
86
+ api_server_logger .info (f"current { self .engine_client .semaphore .status ()} " )
109
87
110
- if request .stream :
111
- return self .chat_completion_stream_generator (
112
- request , request_id , request .model , prompt_token_ids , text_after_process
113
- )
114
- else :
88
+ if request .user is not None :
89
+ request_id = f"chatcmpl-{ request .user } -{ uuid .uuid4 ()} "
90
+ else :
91
+ request_id = f"chatcmpl-{ uuid .uuid4 ()} "
92
+ api_server_logger .info (f"create chat completion request: { request_id } " )
93
+ text_after_process = None
115
94
try :
116
- return await self .chat_completion_full_generator (
117
- request , request_id , request .model , prompt_token_ids , text_after_process
118
- )
95
+ current_req_dict = request .to_dict_for_infer (request_id )
96
+ if "chat_template" not in current_req_dict :
97
+ current_req_dict ["chat_template" ] = self .chat_template
98
+ current_req_dict ["arrival_time" ] = time .time ()
99
+ prompt_token_ids = self .engine_client .format_and_add_data (current_req_dict )
100
+ text_after_process = current_req_dict .get ("text_after_process" )
101
+ if isinstance (prompt_token_ids , np .ndarray ):
102
+ prompt_token_ids = prompt_token_ids .tolist ()
119
103
except Exception as e :
120
104
return ErrorResponse (code = 400 , message = str (e ))
121
105
106
+ del current_req_dict
107
+
108
+ if request .stream :
109
+ return self .chat_completion_stream_generator (
110
+ request , request_id , request .model , prompt_token_ids , text_after_process
111
+ )
112
+ else :
113
+ try :
114
+ return await self .chat_completion_full_generator (
115
+ request , request_id , request .model , prompt_token_ids , text_after_process
116
+ )
117
+ except Exception as e :
118
+ return ErrorResponse (code = 400 , message = str (e ))
119
+ except Exception :
120
+ return ErrorResponse (code = 408 , message = f"Request queued time exceed { self .max_waiting_time } " )
121
+
122
122
def _create_streaming_error_response (self , message : str ) -> str :
123
123
error_response = ErrorResponse (
124
124
code = 400 ,
@@ -254,6 +254,7 @@ async def chat_completion_stream_generator(
254
254
logprobs_res = self ._create_chat_logprobs (
255
255
output_top_logprobs , request .logprobs , request .top_logprobs
256
256
)
257
+
257
258
if self .engine_client .data_processor .tool_parser_obj and not res ["finished" ]:
258
259
tool_delta_message = output ["tool_delta_message" ]
259
260
if tool_delta_message is None :
@@ -277,7 +278,6 @@ async def chat_completion_stream_generator(
277
278
logprobs = logprobs_res ,
278
279
arrival_time = arrival_time ,
279
280
)
280
-
281
281
if res ["finished" ]:
282
282
num_choices -= 1
283
283
work_process_metrics .e2e_request_latency .observe (
@@ -309,7 +309,6 @@ async def chat_completion_stream_generator(
309
309
if len (choices ) == max_streaming_response_tokens or res ["finished" ]:
310
310
chunk .choices = choices
311
311
yield f"data: { chunk .model_dump_json (exclude_unset = True )} \n \n "
312
- # Log the final (tail) packet
313
312
if res ["finished" ]:
314
313
api_server_logger .info (f"Chat Streaming response last send: { chunk .model_dump_json ()} " )
315
314
choices = []
@@ -417,8 +416,9 @@ async def chat_completion_full_generator(
417
416
if task_is_finished :
418
417
break
419
418
finally :
420
- self .engine_client .semaphore .release ()
421
419
dealer .close ()
420
+ self .engine_client .semaphore .release ()
421
+ api_server_logger .info (f"release { self .engine_client .semaphore .status ()} " )
422
422
423
423
choices = []
424
424
output = final_res ["outputs" ]
0 commit comments