@@ -314,20 +314,27 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
314
314
)
315
315
316
316
try :
317
+ created = int (time .time ())
317
318
ret = await tokenizer_manager .generate_request (adapted_request ).__anext__ ()
318
319
if not isinstance (ret , list ):
319
320
ret = [ret ]
320
321
if end_point == "/v1/chat/completions" :
321
322
responses = v1_chat_generate_response (
322
323
request ,
323
324
ret ,
325
+ created ,
324
326
to_file = True ,
325
327
cache_report = tokenizer_manager .server_args .enable_cache_report ,
326
328
tool_call_parser = tokenizer_manager .server_args .tool_call_parser ,
327
329
)
328
330
else :
329
331
responses = v1_generate_response (
330
- request , ret , tokenizer_manager , to_file = True
332
+ request ,
333
+ ret ,
334
+ tokenizer_manager ,
335
+ created ,
336
+ to_file = True ,
337
+ cache_report = tokenizer_manager .server_args .enable_cache_report ,
331
338
)
332
339
333
340
except Exception as e :
@@ -577,7 +584,9 @@ def v1_generate_request(
577
584
return adapted_request , all_requests if len (all_requests ) > 1 else all_requests [0 ]
578
585
579
586
580
- def v1_generate_response (request , ret , tokenizer_manager , to_file = False ):
587
+ def v1_generate_response (
588
+ request , ret , tokenizer_manager , created , to_file = False , cache_report = False
589
+ ):
581
590
choices = []
582
591
echo = False
583
592
@@ -675,7 +684,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
675
684
# remain the same but if needed we can change that
676
685
"id" : ret [i ]["meta_info" ]["id" ],
677
686
"object" : "text_completion" ,
678
- "created" : int ( time . time ()) ,
687
+ "created" : created ,
679
688
"model" : request [i ].model ,
680
689
"choices" : choice ,
681
690
"usage" : {
@@ -694,14 +703,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
694
703
ret [i ]["meta_info" ]["prompt_tokens" ] for i in range (0 , len (ret ), request .n )
695
704
)
696
705
completion_tokens = sum (item ["meta_info" ]["completion_tokens" ] for item in ret )
706
+ cached_tokens = sum (item ["meta_info" ].get ("cached_tokens" , 0 ) for item in ret )
697
707
response = CompletionResponse (
698
708
id = ret [0 ]["meta_info" ]["id" ],
699
709
model = request .model ,
710
+ created = created ,
700
711
choices = choices ,
701
712
usage = UsageInfo (
702
713
prompt_tokens = prompt_tokens ,
703
714
completion_tokens = completion_tokens ,
704
715
total_tokens = prompt_tokens + completion_tokens ,
716
+ prompt_tokens_details = (
717
+ {"cached_tokens" : cached_tokens } if cache_report else None
718
+ ),
705
719
),
706
720
)
707
721
return response
@@ -710,6 +724,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
710
724
async def v1_completions (tokenizer_manager , raw_request : Request ):
711
725
request_json = await raw_request .json ()
712
726
all_requests = [CompletionRequest (** request_json )]
727
+ created = int (time .time ())
713
728
adapted_request , request = v1_generate_request (all_requests )
714
729
715
730
if adapted_request .stream :
@@ -719,6 +734,8 @@ async def generate_stream_resp():
719
734
n_prev_tokens = {}
720
735
prompt_tokens = {}
721
736
completion_tokens = {}
737
+ cached_tokens = {}
738
+
722
739
try :
723
740
async for content in tokenizer_manager .generate_request (
724
741
adapted_request , raw_request
@@ -731,6 +748,7 @@ async def generate_stream_resp():
731
748
text = content ["text" ]
732
749
prompt_tokens [index ] = content ["meta_info" ]["prompt_tokens" ]
733
750
completion_tokens [index ] = content ["meta_info" ]["completion_tokens" ]
751
+ cached_tokens [index ] = content ["meta_info" ].get ("cached_tokens" , 0 )
734
752
735
753
if not stream_buffer : # The first chunk
736
754
if request .echo :
@@ -803,6 +821,7 @@ async def generate_stream_resp():
803
821
)
804
822
chunk = CompletionStreamResponse (
805
823
id = content ["meta_info" ]["id" ],
824
+ created = created ,
806
825
object = "text_completion" ,
807
826
choices = [choice_data ],
808
827
model = request .model ,
@@ -821,14 +840,24 @@ async def generate_stream_resp():
821
840
total_completion_tokens = sum (
822
841
tokens for tokens in completion_tokens .values ()
823
842
)
843
+ cache_report = tokenizer_manager .server_args .enable_cache_report
844
+ if cache_report :
845
+ cached_tokens_sum = sum (
846
+ tokens for tokens in cached_tokens .values ()
847
+ )
848
+ prompt_tokens_details = {"cached_tokens" : cached_tokens_sum }
849
+ else :
850
+ prompt_tokens_details = None
824
851
usage = UsageInfo (
825
852
prompt_tokens = total_prompt_tokens ,
826
853
completion_tokens = total_completion_tokens ,
827
854
total_tokens = total_prompt_tokens + total_completion_tokens ,
855
+ prompt_tokens_details = prompt_tokens_details ,
828
856
)
829
857
830
858
final_usage_chunk = CompletionStreamResponse (
831
859
id = content ["meta_info" ]["id" ],
860
+ created = created ,
832
861
choices = [],
833
862
model = request .model ,
834
863
usage = usage ,
@@ -859,7 +888,13 @@ async def generate_stream_resp():
859
888
if not isinstance (ret , list ):
860
889
ret = [ret ]
861
890
862
- response = v1_generate_response (request , ret , tokenizer_manager )
891
+ response = v1_generate_response (
892
+ request ,
893
+ ret ,
894
+ tokenizer_manager ,
895
+ created ,
896
+ cache_report = tokenizer_manager .server_args .enable_cache_report ,
897
+ )
863
898
return response
864
899
865
900
@@ -1045,6 +1080,7 @@ def v1_chat_generate_request(
1045
1080
def v1_chat_generate_response (
1046
1081
request ,
1047
1082
ret ,
1083
+ created ,
1048
1084
to_file = False ,
1049
1085
cache_report = False ,
1050
1086
tool_call_parser = None ,
@@ -1196,7 +1232,7 @@ def v1_chat_generate_response(
1196
1232
# remain the same but if needed we can change that
1197
1233
"id" : ret [i ]["meta_info" ]["id" ],
1198
1234
"object" : "chat.completion" ,
1199
- "created" : int ( time . time ()) ,
1235
+ "created" : created ,
1200
1236
"model" : request [i ].model ,
1201
1237
"choices" : choice ,
1202
1238
"usage" : {
@@ -1218,6 +1254,7 @@ def v1_chat_generate_response(
1218
1254
cached_tokens = sum (item ["meta_info" ].get ("cached_tokens" , 0 ) for item in ret )
1219
1255
response = ChatCompletionResponse (
1220
1256
id = ret [0 ]["meta_info" ]["id" ],
1257
+ created = created ,
1221
1258
model = request .model ,
1222
1259
choices = choices ,
1223
1260
usage = UsageInfo (
@@ -1232,9 +1269,12 @@ def v1_chat_generate_response(
1232
1269
return response
1233
1270
1234
1271
1235
- async def v1_chat_completions (tokenizer_manager , raw_request : Request ):
1272
+ async def v1_chat_completions (
1273
+ tokenizer_manager , raw_request : Request , cache_report = False
1274
+ ):
1236
1275
request_json = await raw_request .json ()
1237
1276
all_requests = [ChatCompletionRequest (** request_json )]
1277
+ created = int (time .time ())
1238
1278
adapted_request , request = v1_chat_generate_request (all_requests , tokenizer_manager )
1239
1279
1240
1280
if adapted_request .stream :
@@ -1247,6 +1287,7 @@ async def generate_stream_resp():
1247
1287
n_prev_tokens = {}
1248
1288
prompt_tokens = {}
1249
1289
completion_tokens = {}
1290
+ cached_tokens = {}
1250
1291
try :
1251
1292
async for content in tokenizer_manager .generate_request (
1252
1293
adapted_request , raw_request
@@ -1260,6 +1301,7 @@ async def generate_stream_resp():
1260
1301
1261
1302
prompt_tokens [index ] = content ["meta_info" ]["prompt_tokens" ]
1262
1303
completion_tokens [index ] = content ["meta_info" ]["completion_tokens" ]
1304
+ cached_tokens [index ] = content ["meta_info" ].get ("cached_tokens" , 0 )
1263
1305
if request .logprobs :
1264
1306
logprobs = to_openai_style_logprobs (
1265
1307
output_token_logprobs = content ["meta_info" ][
@@ -1339,6 +1381,7 @@ async def generate_stream_resp():
1339
1381
)
1340
1382
chunk = ChatCompletionStreamResponse (
1341
1383
id = content ["meta_info" ]["id" ],
1384
+ created = created ,
1342
1385
choices = [choice_data ],
1343
1386
model = request .model ,
1344
1387
)
@@ -1378,6 +1421,7 @@ async def generate_stream_resp():
1378
1421
)
1379
1422
chunk = ChatCompletionStreamResponse (
1380
1423
id = content ["meta_info" ]["id" ],
1424
+ created = created ,
1381
1425
choices = [choice_data ],
1382
1426
model = request .model ,
1383
1427
)
@@ -1414,6 +1458,7 @@ async def generate_stream_resp():
1414
1458
)
1415
1459
chunk = ChatCompletionStreamResponse (
1416
1460
id = content ["meta_info" ]["id" ],
1461
+ created = created ,
1417
1462
choices = [choice_data ],
1418
1463
model = request .model ,
1419
1464
)
@@ -1464,6 +1509,7 @@ async def generate_stream_resp():
1464
1509
)
1465
1510
chunk = ChatCompletionStreamResponse (
1466
1511
id = content ["meta_info" ]["id" ],
1512
+ created = created ,
1467
1513
choices = [choice_data ],
1468
1514
model = request .model ,
1469
1515
)
@@ -1491,6 +1537,7 @@ async def generate_stream_resp():
1491
1537
)
1492
1538
chunk = ChatCompletionStreamResponse (
1493
1539
id = content ["meta_info" ]["id" ],
1540
+ created = created ,
1494
1541
choices = [choice_data ],
1495
1542
model = request .model ,
1496
1543
)
@@ -1506,14 +1553,24 @@ async def generate_stream_resp():
1506
1553
total_completion_tokens = sum (
1507
1554
tokens for tokens in completion_tokens .values ()
1508
1555
)
1556
+ cache_report = tokenizer_manager .server_args .enable_cache_report
1557
+ if cache_report :
1558
+ cached_tokens_sum = sum (
1559
+ tokens for tokens in cached_tokens .values ()
1560
+ )
1561
+ prompt_tokens_details = {"cached_tokens" : cached_tokens_sum }
1562
+ else :
1563
+ prompt_tokens_details = None
1509
1564
usage = UsageInfo (
1510
1565
prompt_tokens = total_prompt_tokens ,
1511
1566
completion_tokens = total_completion_tokens ,
1512
1567
total_tokens = total_prompt_tokens + total_completion_tokens ,
1568
+ prompt_tokens_details = prompt_tokens_details ,
1513
1569
)
1514
1570
1515
1571
final_usage_chunk = ChatCompletionStreamResponse (
1516
1572
id = content ["meta_info" ]["id" ],
1573
+ created = created ,
1517
1574
choices = [],
1518
1575
model = request .model ,
1519
1576
usage = usage ,
@@ -1546,6 +1603,7 @@ async def generate_stream_resp():
1546
1603
response = v1_chat_generate_response (
1547
1604
request ,
1548
1605
ret ,
1606
+ created ,
1549
1607
cache_report = tokenizer_manager .server_args .enable_cache_report ,
1550
1608
tool_call_parser = tokenizer_manager .server_args .tool_call_parser ,
1551
1609
reasoning_parser = tokenizer_manager .server_args .reasoning_parser ,
0 commit comments