diff --git a/custom_ops/cpu_ops/get_padding_offset.cc b/custom_ops/cpu_ops/get_padding_offset.cc index 8fe73bc8e4..02ee71a263 100644 --- a/custom_ops/cpu_ops/get_padding_offset.cc +++ b/custom_ops/cpu_ops/get_padding_offset.cc @@ -84,7 +84,6 @@ std::vector GetPaddingOffset(const paddle::Tensor &input_ids, seq_length, bsz); return {x_remove_padding, - cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k}; @@ -97,7 +96,7 @@ std::vector> GetPaddingOffsetInferShape( const std::vector &seq_len_shape) { int64_t bsz = seq_len_shape[0]; int64_t seq_len = input_ids_shape[1]; - return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}}; + return {{-1}, {-1}, {bsz + 1}, {bsz + 1}}; } std::vector GetPaddingOffsetInferDtype( @@ -106,7 +105,6 @@ std::vector GetPaddingOffsetInferDtype( const paddle::DataType &token_num_dtype, const paddle::DataType &seq_len_dtype) { return {input_ids_dtype, - seq_len_dtype, seq_len_dtype, seq_len_dtype, seq_len_dtype}; @@ -115,7 +113,6 @@ std::vector GetPaddingOffsetInferDtype( PD_BUILD_STATIC_OP(get_padding_offset_cpu) .Inputs({"input_ids", "cum_offsets", "token_num", "seq_len"}) .Outputs({"x_remove_padding", - "cum_offsets_out", "padding_offset", "cu_seqlens_q", "cu_seqlens_k"}) diff --git a/custom_ops/cpu_ops/rebuild_padding.cc b/custom_ops/cpu_ops/rebuild_padding.cc index 8ce533d041..2dfc9f17e2 100644 --- a/custom_ops/cpu_ops/rebuild_padding.cc +++ b/custom_ops/cpu_ops/rebuild_padding.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -19,10 +19,11 @@ #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) #endif + template void RebuildPaddingCPUImpl(T *output_data, const T *input_data, - const int *cum_offsets_data, + const int *cu_seqlens_q_data, const int *seq_len_this_time_data, const int *seq_lens_decoder_data, const int *seq_lens_encoder_data, @@ -40,11 +41,12 @@ void RebuildPaddingCPUImpl(T *output_data, if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) { continue; } + if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - const int ori_token_idx = - bi * max_input_length - cum_offsets_data[bi] + seq_id; + + const int ori_token_idx = cu_seqlens_q_data[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; output_data[i] = input_data[src_offset]; @@ -54,7 +56,7 @@ void RebuildPaddingCPUImpl(T *output_data, template void RebuildAppendPaddingCPUImpl(T *output_data, const T *input_data, - const int *cum_offsets_data, + const int *cu_seqlens_q_data, const int *seq_len_this_time_data, const int *seq_lens_decoder_data, const int *seq_lens_encoder_data, @@ -69,30 +71,32 @@ void RebuildAppendPaddingCPUImpl(T *output_data, int bi = ori_token_id / max_input_length; if (seq_len_this_time_data[bi] == 0 || (seq_lens_decoder_data[bi] == 0 && - seq_lens_encoder_data[bi] == 0)) { - continue; - } + seq_lens_encoder_data[bi] == 0)) { + continue; + } int seq_id = 0; + if (seq_lens_encoder_data[bi] > 0) { seq_id = seq_lens_encoder_data[bi] - 1; } - int input_token_id = ori_token_id - cum_offsets_data[bi] + seq_id; + int input_token_id = cu_seqlens_q_data[bi] + seq_id; int bias_idx = i % dim_embed; int src_offset = input_token_id * dim_embed + bias_idx; + output_data[i] = input_data[src_offset]; } } std::vector RebuildPaddingCPU( const paddle::Tensor &tmp_out, - const paddle::Tensor &cum_offsets, + const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, const paddle::optional &output_padding_offset, int max_input_length) { auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true); - auto cum_offsets_cpu = cum_offsets.copy_to(paddle::CPUPlace(), true); + auto cu_seqlens_q_cpu = cu_seqlens_q.copy_to(paddle::CPUPlace(), true); auto seq_len_this_time_cpu = seq_len_this_time.copy_to(paddle::CPUPlace(), true); auto seq_lens_decoder_cpu = @@ -107,7 +111,7 @@ std::vector RebuildPaddingCPU( int token_num = tmp_out_cpu.shape()[0]; int dim_embed = tmp_out_cpu.shape()[1]; - int bsz = cum_offsets_cpu.shape()[0]; + int bsz = cu_seqlens_q_cpu.shape()[0] - 1; paddle::Tensor out; if (output_padding_offset_cpu) { @@ -128,7 +132,7 @@ std::vector RebuildPaddingCPU( {bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace()); } - const int *cum_offsets_data = cum_offsets_cpu.data(); + const int *cu_seqlens_q_data = cu_seqlens_q_cpu.data(); const int *seq_len_this_time_data = seq_len_this_time_cpu.data(); const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data(); const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data(); @@ -141,7 +145,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildAppendPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -154,7 +158,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -167,7 +171,7 @@ std::vector RebuildPaddingCPU( RebuildAppendPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -186,7 +190,7 @@ std::vector RebuildPaddingCPU( case paddle::DataType::FLOAT32: RebuildPaddingCPUImpl(out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -198,7 +202,7 @@ std::vector RebuildPaddingCPU( RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -207,11 +211,10 @@ std::vector RebuildPaddingCPU( elem_nums); break; case paddle::DataType::BFLOAT16: - RebuildPaddingCPUImpl( out.data(), tmp_out_cpu.data(), - cum_offsets_data, + cu_seqlens_q_data, seq_len_this_time_data, seq_lens_decoder_data, seq_lens_encoder_data, @@ -230,7 +233,7 @@ std::vector RebuildPaddingCPU( std::vector> RebuildPaddingInferShape( const std::vector &tmp_out_shape, - const std::vector &cum_offsets_shape, + const std::vector &cu_seqlens_q_shape, const std::vector &seq_len_this_time_shape, const std::vector &seq_lens_decoder_shape, const std::vector &seq_lens_encoder_shape, @@ -239,14 +242,14 @@ std::vector> RebuildPaddingInferShape( if (output_padding_offset_shape) { return {{-1, dim_embed}}; } else { - int64_t bsz = cum_offsets_shape[0]; + int64_t bsz = cu_seqlens_q_shape[0] - 1; return {{bsz, dim_embed}}; } } std::vector RebuildPaddingInferDtype( const paddle::DataType &tmp_out_dtype, - const paddle::DataType &cum_offsets_dtype, + const paddle::DataType &cu_seqlens_q_dtype, const paddle::DataType &seq_len_this_time_dtype, const paddle::DataType &seq_lens_decoder_dtype, const paddle::DataType &seq_lens_encoder_dtype, @@ -256,7 +259,7 @@ std::vector RebuildPaddingInferDtype( PD_BUILD_STATIC_OP(rebuild_padding_cpu) .Inputs({"tmp_out", - "cum_offsets", + "cu_seqlens_q", "seq_len_this_time", "seq_lens_decoder", "seq_lens_encoder", diff --git a/custom_ops/gpu_ops/get_padding_offset.cu b/custom_ops/gpu_ops/get_padding_offset.cu index 8fae9b88c3..f505e1c326 100644 --- a/custom_ops/gpu_ops/get_padding_offset.cu +++ b/custom_ops/gpu_ops/get_padding_offset.cu @@ -101,7 +101,6 @@ std::vector GetPaddingOffset(const paddle::Tensor &input_ids, cum_offsets_out.data(), seq_length); return {x_remove_padding, - cum_offsets_out, batch_id_per_token, cu_seqlens_q, cu_seqlens_k}; // , enc_token_num, dec_token_num}; @@ -114,7 +113,7 @@ std::vector> GetPaddingOffsetInferShape( const std::vector &seq_len_shape) { int64_t bsz = seq_len_shape[0]; int64_t seq_len = input_ids_shape[1]; - return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}}; + return {{-1}, {-1}, {bsz + 1}, {bsz + 1}}; } std::vector GetPaddingOffsetInferDtype( @@ -123,7 +122,6 @@ std::vector GetPaddingOffsetInferDtype( const paddle::DataType &token_num_dtype, const paddle::DataType &seq_len_dtype) { return {input_ids_dtype, - seq_len_dtype, seq_len_dtype, seq_len_dtype, seq_len_dtype}; @@ -132,7 +130,6 @@ std::vector GetPaddingOffsetInferDtype( PD_BUILD_STATIC_OP(get_padding_offset) .Inputs({"input_ids", "token_num", "cum_offsets", "seq_len"}) .Outputs({"x_remove_padding", - "cum_offsets_out", "batch_id_per_token", "cu_seqlens_q", "cu_seqlens_k"}) diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu index 3d69e9e459..93c1bb38c2 100644 --- a/custom_ops/gpu_ops/rebuild_padding.cu +++ b/custom_ops/gpu_ops/rebuild_padding.cu @@ -17,7 +17,7 @@ template __global__ void RebuildPaddingKernel(T *output_data, const T *input_data, - const int *cum_offsets, + const int *cu_seqlens_q, const int *seq_len_this_time, const int *seq_len_decoder, const int *seq_len_encoder, @@ -34,10 +34,10 @@ __global__ void RebuildPaddingKernel(T *output_data, int seq_id = 0; if (seq_len_this_time[bi] == 0) continue; if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; - // if encoder, get last token; just decoder, get first token. if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; + const int ori_token_idx = - bi * max_input_length - cum_offsets[bi] + seq_id; + cu_seqlens_q[bi] + seq_id; const int src_offset = ori_token_idx * dim_embed + bias_idx; Load(&input_data[src_offset], &src_vec); Store(src_vec, &output_data[i]); @@ -47,29 +47,31 @@ __global__ void RebuildPaddingKernel(T *output_data, template __global__ void RebuildAppendPaddingKernel(T *output_data, const T *input_data, - const int *cum_offset, + const int *cu_seqlens_q, const int *seq_len_this_time, const int *seq_len_decoder, const int *seq_len_encoder, const int *output_padding_offset, const int max_input_length, const int dim_embed, - const int64_t output_elem_nums) { + const int64_t output_elem_nums, + const int bsz) { AlignedVector src_vec; const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; for (int64_t i = global_idx * VecSize; i < output_elem_nums; i += gridDim.x * blockDim.x * VecSize) { const int out_token_id = i / dim_embed; - const int ori_token_id = - out_token_id + output_padding_offset[out_token_id]; + const int ori_token_id = out_token_id + output_padding_offset[out_token_id]; + const int bi = ori_token_id / max_input_length; + int seq_id = 0; if (seq_len_this_time[bi] == 0) continue; if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue; - // if encoder, get last token; just decoder, get first token. - if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; - const int input_token_id = ori_token_id - cum_offset[bi] + seq_id; + if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1; + const int cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi]; + const int input_token_id = ori_token_id - cum_offset_bi + seq_id; const int bias_idx = i % dim_embed; Load(&input_data[input_token_id * dim_embed + bias_idx], @@ -78,10 +80,11 @@ __global__ void RebuildAppendPaddingKernel(T *output_data, } } + template std::vector rebuild_padding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cum_offsets, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, @@ -100,7 +103,7 @@ std::vector rebuild_padding( std::vector tmp_out_shape = tmp_out.shape(); const int token_num = tmp_out_shape[0]; const int dim_embed = tmp_out_shape[1]; - const int bsz = cum_offsets.shape()[0]; + const int bsz = cu_seqlens_q.shape()[0] - 1; paddle::Tensor out; if (output_padding_offset) { @@ -133,21 +136,22 @@ std::vector rebuild_padding( <<>>( reinterpret_cast(out.data()), reinterpret_cast(tmp_out.data()), - cum_offsets.data(), + cu_seqlens_q.data(), seq_len_this_time.data(), seq_lens_decoder.data(), seq_lens_encoder.data(), output_padding_offset.get_ptr()->data(), max_input_length, dim_embed, - elem_nums); + elem_nums, + bsz); } else { RebuildPaddingKernel <<>>( reinterpret_cast(out.data()), reinterpret_cast( const_cast(tmp_out.data())), - cum_offsets.data(), + cu_seqlens_q.data(), seq_len_this_time.data(), seq_lens_decoder.data(), seq_lens_encoder.data(), @@ -160,7 +164,7 @@ std::vector rebuild_padding( paddle::Tensor RebuildPaddingFunc( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cum_offsets, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, @@ -170,7 +174,7 @@ paddle::Tensor RebuildPaddingFunc( case paddle::DataType::BFLOAT16: { return rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -180,7 +184,7 @@ paddle::Tensor RebuildPaddingFunc( case paddle::DataType::FLOAT16: { return rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -190,7 +194,7 @@ paddle::Tensor RebuildPaddingFunc( case paddle::DataType::FLOAT32: { return rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -208,14 +212,14 @@ paddle::Tensor RebuildPaddingFunc( std::vector RebuildPadding( const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cum_offsets, // [bsz, 1] + const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, const paddle::optional &output_padding_offset, int max_input_length) { return {RebuildPaddingFunc(tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -225,7 +229,7 @@ std::vector RebuildPadding( std::vector> RebuildPaddingInferShape( const std::vector &tmp_out_shape, - const std::vector &cum_offsets_shape, + const std::vector &cu_seqlens_q_shape, const std::vector &seq_len_this_time_shape, const std::vector &seq_lens_decoder_shape, const std::vector &seq_lens_encoder_shape, @@ -235,14 +239,14 @@ std::vector> RebuildPaddingInferShape( if (output_padding_offset_shape) { return {{-1, dim_embed}}; } else { - int64_t bsz = cum_offsets_shape[0]; + int64_t bsz = cu_seqlens_q_shape[0] - 1; return {{bsz, dim_embed}}; } } std::vector RebuildPaddingInferDtype( const paddle::DataType &tmp_out_dtype, - const paddle::DataType &cum_offsets_dtype, + const paddle::DataType &cu_seqlens_q_dtype, const paddle::DataType &seq_len_this_time_dtype, const paddle::DataType &seq_lens_decoder_dtype, const paddle::DataType &seq_lens_encoder_dtype, @@ -252,7 +256,7 @@ std::vector RebuildPaddingInferDtype( PD_BUILD_STATIC_OP(rebuild_padding) .Inputs({"tmp_out", - "cum_offsets", + "cu_seqlens_q", "seq_len_this_time", "seq_lens_decoder", "seq_lens_encoder", diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu index 96186d761f..e37dacbf34 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_get_padding_offset.cu @@ -106,7 +106,6 @@ std::vector SpeculateGetPaddingOffset( seq_length, max_draft_tokens); return {x_remove_padding, - cum_offsets_out, batch_id_per_token, cu_seqlens_q, cu_seqlens_k}; // , enc_token_num, dec_token_num}; @@ -121,7 +120,7 @@ std::vector> SpeculateGetPaddingOffsetInferShape( const std::vector& seq_lens_encoder_shape) { int64_t bsz = seq_len_shape[0]; int64_t seq_len = input_ids_shape[1]; - return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}}; + return {{-1}, {-1}, {bsz + 1}, {bsz + 1}}; } std::vector SpeculateGetPaddingOffsetInferDtype( @@ -132,7 +131,6 @@ std::vector SpeculateGetPaddingOffsetInferDtype( const paddle::DataType& seq_len_dtype, const paddle::DataType& seq_lens_encoder_dtype) { return {input_ids_dtype, - seq_len_dtype, seq_len_dtype, seq_len_dtype, seq_len_dtype}; @@ -141,12 +139,10 @@ std::vector SpeculateGetPaddingOffsetInferDtype( PD_BUILD_STATIC_OP(speculate_get_padding_offset) .Inputs({"input_ids", "draft_tokens", - "cum_offsets", "token_num", "seq_len", "seq_lens_encoder"}) .Outputs({"x_remove_padding", - "cum_offsets_out", "batch_id_per_token", "cu_seqlens_q", "cu_seqlens_k"}) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index b26746e74a..30b87d65b1 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -112,7 +112,6 @@ def pre_process( if speculative_decoding: ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, @@ -142,14 +141,12 @@ def pre_process( else: ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time) return ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, @@ -523,7 +520,7 @@ def step_cuda( def rebuild_padding( tmp_out: paddle.Tensor, - cum_offsets: paddle.Tensor, + cu_seqlens_q: paddle.Tensor, seq_len_this_time: paddle.Tensor, seq_lens_decoder: paddle.Tensor, seq_lens_encoder: paddle.Tensor, @@ -539,7 +536,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -551,7 +548,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -563,7 +560,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -575,7 +572,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -587,7 +584,7 @@ def rebuild_padding( hidden_states = rebuild_padding_cpu( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, @@ -599,7 +596,7 @@ def rebuild_padding( hidden_states = rebuild_padding( tmp_out, - cum_offsets, + cu_seqlens_q, seq_len_this_time, seq_lens_decoder, seq_lens_encoder, diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index b6386c601b..42f76dd8a1 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -274,7 +274,6 @@ def _init_model_inputs(self): self.model_inputs["not_need_stop"] = paddle.to_tensor([False], dtype="bool", place="cpu") self.model_inputs["pre_ids"] = paddle.clone(self.main_model_inputs["pre_ids"]) self.model_inputs["ids_remove_padding"] = paddle.clone(self.main_model_inputs["ids_remove_padding"]) - self.model_inputs["cum_offsets"] = paddle.clone(self.main_model_inputs["cum_offsets"]) self.model_inputs["batch_id_per_token"] = paddle.clone(self.main_model_inputs["batch_id_per_token"]) self.model_inputs["cu_seqlens_q"] = paddle.clone(self.main_model_inputs["cu_seqlens_q"]) self.model_inputs["cu_seqlens_k"] = paddle.clone(self.main_model_inputs["cu_seqlens_k"]) @@ -530,7 +529,6 @@ def _propose(self, target_hidden_states): # Remove padding ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, @@ -546,7 +544,6 @@ def _propose(self, target_hidden_states): ) # Initialize forward meta data self.model_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.model_inputs["cum_offsets"].copy_(cum_offsets, False) self.model_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.model_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.model_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -581,7 +578,7 @@ def _propose(self, target_hidden_states): hidden_states = rebuild_padding( model_output, - self.model_inputs["cum_offsets"], + self.model_inputs["cu_seqlens_q"], self.model_inputs["seq_lens_this_time"], self.model_inputs["seq_lens_decoder"], self.model_inputs["seq_lens_encoder"], diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 6065b496a1..d1f8f2c689 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -423,7 +423,7 @@ def _init_share_inputs(self, max_num_seqs: int): 0, dtype="int64", ) - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["batch_id_per_token"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -522,7 +522,6 @@ def _prepare_inputs(self) -> None: ) self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -742,7 +741,7 @@ def _dummy_run( hidden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], @@ -967,7 +966,7 @@ class at the server level, which is too granular for ModelRunner. hidden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 66908e9168..2dfe1021c6 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -583,7 +583,6 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod self.share_inputs["min_dec_len"][idx : idx + 1] = max_dec_len self.share_inputs["stop_flags"][idx : idx + 1] = False self.share_inputs["temperature"][idx : idx + 1] = 1 - self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length @@ -680,7 +679,6 @@ def _init_share_inputs(self, max_num_seqs: int): 0, dtype="int64", ) - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["batch_id_per_token"] = paddle.full( [max_num_seqs * self.parallel_config.max_model_len, 1], 0, dtype="int32" ) @@ -803,7 +801,6 @@ def _prepare_inputs(self) -> None: # Remove padding ( ids_remove_padding, - cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, @@ -819,7 +816,6 @@ def _prepare_inputs(self) -> None: ) self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) - self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) @@ -965,7 +961,6 @@ def initialize_kv_cache(self, profile: bool = False) -> None: cache_kvs_list.append(value_cache) self.share_inputs["caches"] = cache_kvs_list - else: for i in range(self.model_config.num_hidden_layers): cache_kvs[f"key_caches_{i}"] = paddle.full( @@ -1071,7 +1066,7 @@ def _dummy_run( hidden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], @@ -1336,7 +1331,7 @@ class at the server level, which is too granular for ModelRunner. ) hidden_states = rebuild_padding( model_output, - self.share_inputs["cum_offsets"], + self.share_inputs["cu_seqlens_q"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], @@ -1436,6 +1431,7 @@ class at the server level, which is too granular for ModelRunner. # 7. Updata 'infer_seed' and step_cuda() self.share_inputs["infer_seed"].add_(self.infer_seed_increment) self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED + if not envs.ENABLE_V1_KVCACHE_SCHEDULER: step_cuda( self.share_inputs,