Skip to content

Commit 9e198dc

Browse files
Merge branch 'release/2.1' into Jiang-Jia-Jun-patch-2
2 parents fc877ed + d259275 commit 9e198dc

File tree

7 files changed

+37
-15
lines changed

7 files changed

+37
-15
lines changed

fastdeploy/cache_manager/prefix_cache_manager.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,10 @@ def __init__(
6464
self.speculative_config = config.speculative_config
6565
self.local_data_parallel_id = local_data_parallel_id
6666

67-
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
67+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
68+
self.num_gpu_blocks = self.cache_config.total_block_num
69+
else:
70+
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
6871
self.num_cpu_blocks = self.cache_config.num_cpu_blocks
6972
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
7073
if self.num_cpu_blocks > 0:

fastdeploy/config.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -726,7 +726,10 @@ def __init__(self, args):
726726
self.block_size = 64
727727
self.gpu_memory_utilization = 0.9
728728
self.num_gpu_blocks_override = None
729-
self.kv_cache_ratio = 0.75
729+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
730+
self.kv_cache_ratio = 1.0
731+
else:
732+
self.kv_cache_ratio = 0.75
730733
self.enc_dec_block_num = 2
731734
self.prealloc_dec_block_slot_num_threshold = 5
732735
self.cache_dtype = "bfloat16"
@@ -811,7 +814,10 @@ def postprocess(self, num_total_tokens, number_of_tasks):
811814
self.dec_token_num = self.enc_dec_block_num * self.block_size
812815
if self.num_gpu_blocks_override is not None:
813816
self.total_block_num = self.num_gpu_blocks_override
814-
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
817+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
818+
self.prefill_kvcache_block_num = self.total_block_num
819+
else:
820+
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
815821
else:
816822
length = num_total_tokens // number_of_tasks
817823
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
@@ -824,7 +830,10 @@ def reset(self, num_gpu_blocks):
824830
reset gpu block number
825831
"""
826832
self.total_block_num = num_gpu_blocks
827-
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
833+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
834+
self.prefill_kvcache_block_num = self.total_block_num
835+
else:
836+
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
828837
logger.info(
829838
f"Reset block num, the total_block_num:{self.total_block_num},"
830839
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"

fastdeploy/engine/args_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from dataclasses import asdict, dataclass
1919
from dataclasses import fields as dataclass_fields
2020
from typing import Any, Dict, List, Optional
21+
import os
2122

2223
from fastdeploy.config import (
2324
CacheConfig,
@@ -865,7 +866,10 @@ def create_engine_config(self) -> Config:
865866
if self.enable_chunked_prefill:
866867
self.max_num_batched_tokens = 2048
867868
else:
868-
self.max_num_batched_tokens = self.max_model_len
869+
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
870+
self.max_num_batched_tokens = self.max_model_len
871+
else:
872+
self.max_num_batched_tokens = 8192
869873

870874
all_dict = asdict(self)
871875
all_dict["model_cfg"] = model_cfg

fastdeploy/engine/config.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,10 @@ def postprocess(self):
236236
if self.cache_config.enable_chunked_prefill:
237237
self.max_num_batched_tokens = 2048
238238
else:
239-
self.max_num_batched_tokens = self.max_model_len
239+
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
240+
self.max_num_batched_tokens = self.max_model_len
241+
else:
242+
self.max_num_batched_tokens = 8192
240243

241244
if self.long_prefill_token_threshold == 0:
242245
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -284,10 +287,11 @@ def check(self):
284287
)
285288

286289
if not self.cache_config.enable_chunked_prefill:
287-
assert self.max_num_batched_tokens >= self.max_model_len, (
288-
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
289-
f"should be larger than or equal to max_model_len: {self.max_model_len}"
290-
)
290+
if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
291+
assert self.max_num_batched_tokens >= self.max_model_len, (
292+
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
293+
f"should be larger than or equal to max_model_len: {self.max_model_len}"
294+
)
291295
else:
292296
assert self.max_num_batched_tokens >= self.cache_config.block_size, (
293297
f"max_num_batched_tokens: {self.max_num_batched_tokens} "

fastdeploy/engine/sched/resource_manager_v1.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,6 @@ def _get_num_new_tokens(self, request, token_budget):
195195
)
196196
request.num_image_end = img_num_per_boundary[new_boundary_idx]
197197

198-
request.num_image_end = img_num_per_boundary[new_boundary_idx]
199198
request.image_type_ids_start = np.sum(grid_thw[: request.num_image_start, 0])
200199
request.image_type_ids_end = np.sum(grid_thw[: request.num_image_end, 0])
201200
request.image_start = np.sum(np.prod(grid_thw[: request.num_image_start], axis=1))

fastdeploy/output/token_processor.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -463,8 +463,9 @@ def _process_batch_output(self):
463463
if recovery_stop:
464464
llm_logger.info(f"recovery stop signal found at task {task_id}")
465465
if not recovery_stop and token_id < 0:
466-
if task_id in self.resource_manager.to_be_rescheduled_request_id_set:
467-
self.resource_manager.reschedule_preempt_task(task_id)
466+
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
467+
if task_id in self.resource_manager.to_be_rescheduled_request_id_set:
468+
self.resource_manager.reschedule_preempt_task(task_id)
468469
continue
469470

470471
if task.get("prefill_chunk_info", None) is not None:

fastdeploy/worker/worker_process.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import paddle.distributed as dist
2525
from paddle.distributed import fleet
2626

27+
from fastdeploy import envs
2728
from fastdeploy.config import (
2829
CacheConfig,
2930
DecodingConfig,
@@ -289,8 +290,9 @@ def event_loop_normal(self) -> None:
289290
if self.local_rank % mp_num_per_node == 0:
290291
if self.task_queue.num_tasks() > 0:
291292
# VL only support 1 batch to prefill
292-
293-
if not self.fd_config.model_config.enable_mm or not self.worker.exist_prefill():
293+
if envs.ENABLE_V1_KVCACHE_SCHEDULER or not (
294+
self.fd_config.model_config.enable_mm and self.worker.exist_prefill()
295+
):
294296
if self.nnode > 1 and self.parallel_config.tensor_parallel_size > self.max_chips_per_node:
295297
self.task_queue.read_finish_flag.set(1)
296298
else:

0 commit comments

Comments (0)