Commit c6a133d
[Bug fix] Fix block num in scheduler v1 for release2.0.4 (#3314)
* fix bug for scheduler v0
* fix block num setting in scheduler v1
1 parent: 4646aff

File tree

5 files changed: +34 −12 lines changed


fastdeploy/cache_manager/prefix_cache_manager.py

Lines changed: 4 additions & 1 deletion
@@ -64,7 +64,10 @@ def __init__(
         self.speculative_config = config.speculative_config
         self.local_data_parallel_id = local_data_parallel_id

-        self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.num_gpu_blocks = self.cache_config.total_block_num
+        else:
+            self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
         self.num_cpu_blocks = self.cache_config.num_cpu_blocks
         self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
         if self.num_cpu_blocks > 0:
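
With the V1 KV-cache scheduler enabled, the prefix cache manager now sizes its GPU block pool from total_block_num instead of the prefill-only share. A minimal standalone sketch of that selection, not the FastDeploy source; cache_config here is assumed to be any object exposing the two fields shown in the diff:

    # Illustrative sketch only: mirrors the constructor logic above.
    def build_gpu_free_block_list(cache_config, enable_v1_kvcache_scheduler: bool):
        if enable_v1_kvcache_scheduler:
            # V1 scheduler: the cache manager owns every GPU block.
            num_gpu_blocks = cache_config.total_block_num
        else:
            # V0 scheduler: only the prefill share of the block pool.
            num_gpu_blocks = cache_config.prefill_kvcache_block_num
        # Reversed free list, as in the constructor: highest block id pops first.
        return list(range(num_gpu_blocks - 1, -1, -1))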

fastdeploy/engine/args_utils.py

Lines changed: 5 additions & 1 deletion
@@ -18,6 +18,7 @@
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
+import os

 from fastdeploy.engine.config import (
     CacheConfig,
@@ -854,7 +855,10 @@ def create_engine_config(self) -> Config:
         if self.enable_chunked_prefill:
             self.max_num_batched_tokens = 2048
         else:
-            self.max_num_batched_tokens = self.max_model_len
+            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+                self.max_num_batched_tokens = self.max_model_len
+            else:
+                self.max_num_batched_tokens = 8192
         scheduler_cfg = self.create_scheduler_config()
         speculative_cfg = self.create_speculative_config()
         graph_opt_cfg = self.create_graph_optimization_config()
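
Note that this branch reads the raw environment variable via os.getenv rather than the envs helper used elsewhere in the commit. A small standalone check of how the gate evaluates, illustrative only:

    import os

    # Unset or "0" selects the V0 path; "1" selects the V1 path.
    os.environ.pop("ENABLE_V1_KVCACHE_SCHEDULER", None)
    assert not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0"))  # V0 default

    os.environ["ENABLE_V1_KVCACHE_SCHEDULER"] = "1"
    assert int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0"))      # V1 enabled

One caveat: int() raises ValueError for non-numeric values such as "true", so this flag must be set to a digit string.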

fastdeploy/engine/config.py

Lines changed: 19 additions & 7 deletions
@@ -211,6 +211,8 @@ def __init__(
         self.gpu_memory_utilization = gpu_memory_utilization
         self.num_gpu_blocks_override = num_gpu_blocks_override
         self.kv_cache_ratio = kv_cache_ratio
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.kv_cache_ratio = 1.0
         self.enc_dec_block_num = enc_dec_block_num
         self.prealloc_dec_block_slot_num_threshold = prealloc_dec_block_slot_num_threshold
         self.cache_dtype = cache_dtype
@@ -291,7 +293,10 @@ def postprocess(self, num_total_tokens, number_of_tasks):
         self.dec_token_num = self.enc_dec_block_num * self.block_size
         if self.num_gpu_blocks_override is not None:
             self.total_block_num = self.num_gpu_blocks_override
-            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
+            if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+                self.prefill_kvcache_block_num = self.total_block_num
+            else:
+                self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
         else:
             length = num_total_tokens // number_of_tasks
             block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
@@ -304,7 +309,10 @@ def reset(self, num_gpu_blocks):
         reset gpu block number
         """
         self.total_block_num = num_gpu_blocks
-        self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            self.prefill_kvcache_block_num = self.total_block_num
+        else:
+            self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
         llm_logger.info(
             f"Reset block num, the total_block_num:{self.total_block_num},"
             f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
@@ -796,7 +804,10 @@ def postprocess(self):
         if self.cache_config.enable_chunked_prefill:
             self.max_num_batched_tokens = 2048
         else:
-            self.max_num_batched_tokens = self.max_model_len
+            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+                self.max_num_batched_tokens = self.max_model_len
+            else:
+                self.max_num_batched_tokens = 8192

         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -844,10 +855,11 @@ def check(self):
         )

         if not self.cache_config.enable_chunked_prefill:
-            assert self.max_num_batched_tokens >= self.max_model_len, (
-                f"max_num_batched_tokens: {self.max_num_batched_tokens} "
-                f"should be larger than or equal to max_model_len: {self.max_model_len}"
-            )
+            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+                assert self.max_num_batched_tokens >= self.max_model_len, (
+                    f"max_num_batched_tokens: {self.max_num_batched_tokens} "
+                    f"should be larger than or equal to max_model_len: {self.max_model_len}"
+                )
         else:
             assert self.max_num_batched_tokens >= self.cache_config.block_size, (
                 f"max_num_batched_tokens: {self.max_num_batched_tokens} "

fastdeploy/engine/sched/resource_manager_v1.py

Lines changed: 1 addition & 2 deletions
@@ -234,8 +234,7 @@ def schedule(self):
             llm_logger.debug(
                 f"scheduler prefill task: {request} request.need_prefill_tokens {request.need_prefill_tokens} request.num_computed_tokens {request.num_computed_tokens}"
             )
-            num_new_tokens = request.prompt_token_ids_len - request.num_computed_tokens
-            num_new_tokens = min(num_new_tokens, token_budget)
+            num_new_tokens = self._get_num_new_tokens(request, token_budget)
             num_new_block = self.get_new_block_nums(request, num_new_tokens)
             # Allocate blocks to prefill
             if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
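
The inline token computation is replaced by a call to self._get_num_new_tokens, whose body is not part of this diff. A hypothetical reconstruction from the removed lines; the real helper presumably counts from request.need_prefill_tokens (as the debug log above suggests) rather than prompt_token_ids_len, which would be the actual fix:

    # Hypothetical sketch only -- this commit does not show _get_num_new_tokens.
    def _get_num_new_tokens(self, request, token_budget: int) -> int:
        # Tokens still to prefill for this request (need_prefill_tokens assumed),
        # clamped to the scheduler's per-step token budget.
        num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens
        return min(num_new_tokens, token_budget)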

fastdeploy/worker/gpu_model_runner.py

Lines changed: 5 additions & 1 deletion
@@ -208,11 +208,15 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
             request = req_dicts[i]
             idx = request.idx
             if request.task_type.value == RequestType.PREFILL.value:  # prefill task
-                logger.debug(f"Handle prefill request {request} at idx {idx}")
                 prefill_start_index = request.prefill_start_index
                 prefill_end_index = request.prefill_end_index
                 length = prefill_end_index - prefill_start_index
                 input_ids = request.prompt_token_ids + request.output_token_ids
+                logger.debug(
+                    f"Handle prefill request {request} at idx {idx}, "
+                    f"{prefill_start_index=}, {prefill_end_index=}, "
+                    f"need_prefilled_token_num={len(input_ids)}"
+                )
                 self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
                     input_ids[prefill_start_index:prefill_end_index]
                 )
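
The debug log is moved below the index computation so it can report the prefill range, using Python 3.8+ self-documenting f-string expressions: f"{name=}" expands to name=<value>. A minimal demonstration with assumed values:

    prefill_start_index, prefill_end_index = 0, 128  # assumed values
    msg = f"{prefill_start_index=}, {prefill_end_index=}"
    assert msg == "prefill_start_index=0, prefill_end_index=128"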
