Skip to content

Commit 417fc72

Browse files
authored
Align completion and chat_completion response to OpenAI API (sgl-project#4637)
1 parent c6ec702 commit 417fc72

File tree

1 file changed: 64 additions (+64) and 6 deletions (-6)

python/sglang/srt/openai_api/adapter.py

Lines changed: 64 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -314,20 +314,27 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
314314
)
315315

316316
try:
317+
created = int(time.time())
317318
ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
318319
if not isinstance(ret, list):
319320
ret = [ret]
320321
if end_point == "/v1/chat/completions":
321322
responses = v1_chat_generate_response(
322323
request,
323324
ret,
325+
created,
324326
to_file=True,
325327
cache_report=tokenizer_manager.server_args.enable_cache_report,
326328
tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
327329
)
328330
else:
329331
responses = v1_generate_response(
330-
request, ret, tokenizer_manager, to_file=True
332+
request,
333+
ret,
334+
tokenizer_manager,
335+
created,
336+
to_file=True,
337+
cache_report=tokenizer_manager.server_args.enable_cache_report,
331338
)
332339

333340
except Exception as e:
@@ -577,7 +584,9 @@ def v1_generate_request(
577584
return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
578585

579586

580-
def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
587+
def v1_generate_response(
588+
request, ret, tokenizer_manager, created, to_file=False, cache_report=False
589+
):
581590
choices = []
582591
echo = False
583592

@@ -675,7 +684,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
675684
# remain the same but if needed we can change that
676685
"id": ret[i]["meta_info"]["id"],
677686
"object": "text_completion",
678-
"created": int(time.time()),
687+
"created": created,
679688
"model": request[i].model,
680689
"choices": choice,
681690
"usage": {
@@ -694,14 +703,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
694703
ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
695704
)
696705
completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
706+
cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
697707
response = CompletionResponse(
698708
id=ret[0]["meta_info"]["id"],
699709
model=request.model,
710+
created=created,
700711
choices=choices,
701712
usage=UsageInfo(
702713
prompt_tokens=prompt_tokens,
703714
completion_tokens=completion_tokens,
704715
total_tokens=prompt_tokens + completion_tokens,
716+
prompt_tokens_details=(
717+
{"cached_tokens": cached_tokens} if cache_report else None
718+
),
705719
),
706720
)
707721
return response
@@ -710,6 +724,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
710724
async def v1_completions(tokenizer_manager, raw_request: Request):
711725
request_json = await raw_request.json()
712726
all_requests = [CompletionRequest(**request_json)]
727+
created = int(time.time())
713728
adapted_request, request = v1_generate_request(all_requests)
714729

715730
if adapted_request.stream:
@@ -719,6 +734,8 @@ async def generate_stream_resp():
719734
n_prev_tokens = {}
720735
prompt_tokens = {}
721736
completion_tokens = {}
737+
cached_tokens = {}
738+
722739
try:
723740
async for content in tokenizer_manager.generate_request(
724741
adapted_request, raw_request
@@ -731,6 +748,7 @@ async def generate_stream_resp():
731748
text = content["text"]
732749
prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
733750
completion_tokens[index] = content["meta_info"]["completion_tokens"]
751+
cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
734752

735753
if not stream_buffer: # The first chunk
736754
if request.echo:
@@ -803,6 +821,7 @@ async def generate_stream_resp():
803821
)
804822
chunk = CompletionStreamResponse(
805823
id=content["meta_info"]["id"],
824+
created=created,
806825
object="text_completion",
807826
choices=[choice_data],
808827
model=request.model,
@@ -821,14 +840,24 @@ async def generate_stream_resp():
821840
total_completion_tokens = sum(
822841
tokens for tokens in completion_tokens.values()
823842
)
843+
cache_report = tokenizer_manager.server_args.enable_cache_report
844+
if cache_report:
845+
cached_tokens_sum = sum(
846+
tokens for tokens in cached_tokens.values()
847+
)
848+
prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
849+
else:
850+
prompt_tokens_details = None
824851
usage = UsageInfo(
825852
prompt_tokens=total_prompt_tokens,
826853
completion_tokens=total_completion_tokens,
827854
total_tokens=total_prompt_tokens + total_completion_tokens,
855+
prompt_tokens_details=prompt_tokens_details,
828856
)
829857

830858
final_usage_chunk = CompletionStreamResponse(
831859
id=content["meta_info"]["id"],
860+
created=created,
832861
choices=[],
833862
model=request.model,
834863
usage=usage,
@@ -859,7 +888,13 @@ async def generate_stream_resp():
859888
if not isinstance(ret, list):
860889
ret = [ret]
861890

862-
response = v1_generate_response(request, ret, tokenizer_manager)
891+
response = v1_generate_response(
892+
request,
893+
ret,
894+
tokenizer_manager,
895+
created,
896+
cache_report=tokenizer_manager.server_args.enable_cache_report,
897+
)
863898
return response
864899

865900

@@ -1045,6 +1080,7 @@ def v1_chat_generate_request(
10451080
def v1_chat_generate_response(
10461081
request,
10471082
ret,
1083+
created,
10481084
to_file=False,
10491085
cache_report=False,
10501086
tool_call_parser=None,
@@ -1196,7 +1232,7 @@ def v1_chat_generate_response(
11961232
# remain the same but if needed we can change that
11971233
"id": ret[i]["meta_info"]["id"],
11981234
"object": "chat.completion",
1199-
"created": int(time.time()),
1235+
"created": created,
12001236
"model": request[i].model,
12011237
"choices": choice,
12021238
"usage": {
@@ -1218,6 +1254,7 @@ def v1_chat_generate_response(
12181254
cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
12191255
response = ChatCompletionResponse(
12201256
id=ret[0]["meta_info"]["id"],
1257+
created=created,
12211258
model=request.model,
12221259
choices=choices,
12231260
usage=UsageInfo(
@@ -1232,9 +1269,12 @@ def v1_chat_generate_response(
12321269
return response
12331270

12341271

1235-
async def v1_chat_completions(tokenizer_manager, raw_request: Request):
1272+
async def v1_chat_completions(
1273+
tokenizer_manager, raw_request: Request, cache_report=False
1274+
):
12361275
request_json = await raw_request.json()
12371276
all_requests = [ChatCompletionRequest(**request_json)]
1277+
created = int(time.time())
12381278
adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
12391279

12401280
if adapted_request.stream:
@@ -1247,6 +1287,7 @@ async def generate_stream_resp():
12471287
n_prev_tokens = {}
12481288
prompt_tokens = {}
12491289
completion_tokens = {}
1290+
cached_tokens = {}
12501291
try:
12511292
async for content in tokenizer_manager.generate_request(
12521293
adapted_request, raw_request
@@ -1260,6 +1301,7 @@ async def generate_stream_resp():
12601301

12611302
prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
12621303
completion_tokens[index] = content["meta_info"]["completion_tokens"]
1304+
cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
12631305
if request.logprobs:
12641306
logprobs = to_openai_style_logprobs(
12651307
output_token_logprobs=content["meta_info"][
@@ -1339,6 +1381,7 @@ async def generate_stream_resp():
13391381
)
13401382
chunk = ChatCompletionStreamResponse(
13411383
id=content["meta_info"]["id"],
1384+
created=created,
13421385
choices=[choice_data],
13431386
model=request.model,
13441387
)
@@ -1378,6 +1421,7 @@ async def generate_stream_resp():
13781421
)
13791422
chunk = ChatCompletionStreamResponse(
13801423
id=content["meta_info"]["id"],
1424+
created=created,
13811425
choices=[choice_data],
13821426
model=request.model,
13831427
)
@@ -1414,6 +1458,7 @@ async def generate_stream_resp():
14141458
)
14151459
chunk = ChatCompletionStreamResponse(
14161460
id=content["meta_info"]["id"],
1461+
created=created,
14171462
choices=[choice_data],
14181463
model=request.model,
14191464
)
@@ -1464,6 +1509,7 @@ async def generate_stream_resp():
14641509
)
14651510
chunk = ChatCompletionStreamResponse(
14661511
id=content["meta_info"]["id"],
1512+
created=created,
14671513
choices=[choice_data],
14681514
model=request.model,
14691515
)
@@ -1491,6 +1537,7 @@ async def generate_stream_resp():
14911537
)
14921538
chunk = ChatCompletionStreamResponse(
14931539
id=content["meta_info"]["id"],
1540+
created=created,
14941541
choices=[choice_data],
14951542
model=request.model,
14961543
)
@@ -1506,14 +1553,24 @@ async def generate_stream_resp():
15061553
total_completion_tokens = sum(
15071554
tokens for tokens in completion_tokens.values()
15081555
)
1556+
cache_report = tokenizer_manager.server_args.enable_cache_report
1557+
if cache_report:
1558+
cached_tokens_sum = sum(
1559+
tokens for tokens in cached_tokens.values()
1560+
)
1561+
prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
1562+
else:
1563+
prompt_tokens_details = None
15091564
usage = UsageInfo(
15101565
prompt_tokens=total_prompt_tokens,
15111566
completion_tokens=total_completion_tokens,
15121567
total_tokens=total_prompt_tokens + total_completion_tokens,
1568+
prompt_tokens_details=prompt_tokens_details,
15131569
)
15141570

15151571
final_usage_chunk = ChatCompletionStreamResponse(
15161572
id=content["meta_info"]["id"],
1573+
created=created,
15171574
choices=[],
15181575
model=request.model,
15191576
usage=usage,
@@ -1546,6 +1603,7 @@ async def generate_stream_resp():
15461603
response = v1_chat_generate_response(
15471604
request,
15481605
ret,
1606+
created,
15491607
cache_report=tokenizer_manager.server_args.enable_cache_report,
15501608
tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
15511609
reasoning_parser=tokenizer_manager.server_args.reasoning_parser,

0 commit comments

Comments (0)