
Commit 6a69e04

add qwen_vl_processor unittest
1 parent 3c78d5e commit 6a69e04

File tree: 3 files changed, +244 −35 lines. (Only two of the three files are expanded below; the third, presumably the new unittest, was collapsed in this view.)

fastdeploy/input/qwen_mm_processor/process.py

Lines changed: 40 additions & 6 deletions
@@ -101,6 +101,21 @@ def __init__(
         }
 
     def _pack_outputs(self, outputs):
+        """
+        Pack and convert all output data into numpy arrays with appropriate types.
+
+        Args:
+            outputs (dict): Dictionary containing model outputs with keys:
+                - images: List of visual features
+                - grid_thw: List of spatial dimensions
+                - image_type_ids: List of content type indicators
+                - input_ids: List of token IDs
+                - token_type_ids: List of type identifiers
+                - position_ids: List of position embeddings
+
+        Returns:
+            dict: Processed outputs with all values converted to numpy arrays
+        """
         # Process visual outputs - stack if exists or set to None if empty
         if not outputs["images"]:
             outputs["images"] = None  # No images case
@@ -188,6 +203,21 @@ def text2ids(self, text, images=None, videos=None):
 
         return self._pack_outputs(outputs)
 
+    def _parse_chat_messages(self, request):
+        """
+        Parse chat messages from request into structured format.
+
+        Args:
+            request (dict): Input request containing chat messages
+
+        Returns:
+            list: Parsed list of message dictionaries with:
+                - role (str): Message role (user/assistant)
+                - content (str): Message text content
+                - images (list, optional): List of image data if present
+        """
+        return parse_chat_messages(request.get("messages"))
+
     def request2ids(
         self, request: Dict[str, Any], tgts: List[str] = None
     ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
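
Extracting the parse_chat_messages call into a small method is what makes the new unittest practical: a test can stub message parsing on the processor instance rather than patching the module-level import. A minimal sketch of that pattern, assuming a pytest-style `processor` fixture (not part of this commit's visible diff):

    from unittest import mock

    def test_request2ids_uses_parse_hook(processor):
        # Stub the new hook so the test controls exactly what request2ids sees.
        fake = [{"role": "user", "content": "hello"}]
        with mock.patch.object(
            processor, "_parse_chat_messages", return_value=fake
        ) as parse_mock:
            processor.request2ids({"messages": []})
        parse_mock.assert_called_once()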
@@ -218,7 +248,7 @@ def request2ids(
         }
 
         # Parse and validate chat messages
-        messages = parse_chat_messages(request.get("messages"))
+        messages = self._parse_chat_messages(request)
         image_message_list = []  # Store visual content messages
 
         for msg in messages:
@@ -234,11 +264,14 @@ def request2ids(
             for item in content_items:
                 if isinstance(item, dict) and item.get("type") in ["image", "video"]:
                     image_message_list.append(item)
+
+        raw_messages = request["messages"]
         request["messages"] = messages
 
         prompt_token_ids = self.apply_chat_template(request)
         if len(prompt_token_ids) == 0:
             raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
+        request["messages"] = raw_messages
 
         vision_start_index = 0
         vision_message_index = 0
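
The raw_messages save/restore is the behavioral change here: request["messages"] is swapped for the parsed form only while apply_chat_template runs, then put back, so the caller's request dict leaves request2ids as it arrived. That invariant is easy to pin down in a test; a hedged sketch (the `processor` fixture is an assumption):

    import copy

    def test_request2ids_does_not_mutate_messages(processor):
        request = {"messages": [{"role": "user", "content": "hi"}]}
        before = copy.deepcopy(request["messages"])
        processor.request2ids(request)
        # raw_messages is restored after apply_chat_template, so the
        # caller still sees the original message list.
        assert request["messages"] == before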
@@ -376,17 +409,17 @@ def _compute_vision_positions(
         self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float
     ) -> np.ndarray:
         """
-        Generate 3D positional embeddings for visual content.
+        Generate 3D position IDs for visual inputs.
 
         Args:
-            start_pos: Starting position index
-            t: Temporal dimension (frames)
+            start_pos: Base position in sequence
+            t: Temporal patches (1 for images)
             h: Height in patches
             w: Width in patches
-            second_per_grid_t: Seconds per temporal grid
+            second_per_grid_t: Time per temporal patch
 
         Returns:
-            numpy.ndarray: 3D position IDs shaped (3, t*h*w)
+            np.ndarray: Position IDs for [t,h,w] dimensions
         """
         h //= self.spatial_conv_size
         w //= self.spatial_conv_size
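
For context, the "(3, t*h*w)" shape in the old docstring comes from enumerating a (t, h, w) coordinate for every visual patch after the spatial merge. A rough standalone sketch of that layout, not the repository's exact implementation (it ignores the second_per_grid_t temporal scaling, among other details):

    import numpy as np

    def vision_positions(start_pos, t, h, w, spatial_conv_size=2):
        h, w = h // spatial_conv_size, w // spatial_conv_size
        tt = np.arange(t).repeat(h * w)          # temporal index per patch
        hh = np.tile(np.arange(h).repeat(w), t)  # row index per patch
        ww = np.tile(np.arange(w), t * h)        # column index per patch
        return start_pos + np.stack([tt, hh, ww])  # shape (3, t*h*w)

    pos = vision_positions(start_pos=10, t=2, h=4, w=4)
    assert pos.shape == (3, 8)  # 2 * (4//2) * (4//2) patches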
@@ -478,6 +511,7 @@ def apply_chat_template(self, request):
             add_generation_prompt=request.get("add_generation_prompt", True),
         )
         prompt_token_str = raw_prompt.replace(self.image_token, "").replace(self.video_token, "")
+        request["text_after_process"] = raw_prompt
 
         tokens = self.tokenizer.tokenize(prompt_token_str)
         token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
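
Recording the rendered template string on the request gives a unittest (and downstream logging) a hook to inspect what the chat template produced before the image/video placeholder tokens are stripped for tokenization. A sketch of the assertion this enables, with the fixture wiring assumed:

    def test_text_after_process_is_recorded(processor):
        request = {"messages": [{"role": "user", "content": "describe the scene"}]}
        processor.request2ids(request)
        # apply_chat_template now stores the raw rendered prompt.
        assert isinstance(request.get("text_after_process"), str)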

fastdeploy/input/qwen_vl_processor.py

Lines changed: 45 additions & 29 deletions
@@ -24,19 +24,18 @@
 
 class QwenVLProcessor(TextProcessor):
     """
-    Processor for Qwen Vision-Language models that handles multimodal inputs.
+    Qwen Vision-Language processor for handling multimodal inputs.
 
-    Inherits from ErnieProcessor and extends functionality for:
+    This processor extends TextProcessor to support:
     - Image and video processing
-    - Multimodal request handling
-    - Generation configuration
+    - Multimodal feature extraction
+    - Tokenization and position encoding
+    - Request processing and model input generation
 
     Attributes:
-        processor: Underlying DataProcessor instance
-        tokenizer: Text tokenizer
-        generation_config: Model generation configuration
-        eos_token_ids: End-of-sequence token IDs
-        limit_mm_per_prompt: Limits for multimodal inputs
+        processor (DataProcessor): Underlying data processor instance
+        tokenizer: Text tokenizer instance
+        limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt
     """
 
     def __init__(
@@ -49,14 +48,15 @@ def __init__(
         tool_parser_obj=None,
     ):
         """
-        Initialize QwenVLProcessor.
+        Initialize QwenVLProcessor instance.
 
         Args:
-            config: Model configuration
-            model_name_or_path: Path to pretrained model
-            limit_mm_per_prompt: Limits for multimodal inputs per prompt
-            mm_processor_kwargs: Additional kwargs for multimodal processor
-            reasoning_parser_obj: Optional reasoning parser
+            config: Model configuration object
+            model_name_or_path (str): Pretrained model name or path
+            limit_mm_per_prompt (dict, optional): Limits for multimodal inputs
+            mm_processor_kwargs (dict, optional): Multimodal processor arguments
+            reasoning_parser_obj: Reasoning parser instance
+            tool_parser_obj: Tool parser instance
         """
         super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj)
 
@@ -73,12 +73,12 @@ def __init__(
 
     def process_request(self, request, max_model_len=None, **kwargs):
         """
-        Process incoming request into model inputs.
+        Process incoming request and generate model inputs.
 
         Args:
             request: Input request object
-            max_model_len: Maximum model context length
-            **kwargs: Additional processing arguments
+            max_model_len (int, optional): Maximum context length
+            **kwargs: Additional processing parameters
 
         Returns:
             Request: Processed request with model inputs
@@ -92,16 +92,16 @@ def process_request(self, request, max_model_len=None, **kwargs):
9292

9393
def _parse_processor_kwargs(self, kwargs):
9494
"""
95-
Parse and validate multimodal processor kwargs.
95+
Parse and validate multimodal processor arguments.
9696
9797
Args:
98-
kwargs: Input kwargs dictionary
98+
kwargs (dict): Processor configuration arguments
9999
100100
Returns:
101-
dict: Validated processor kwargs
101+
dict: Validated processor arguments
102102
103103
Raises:
104-
ValueError: If kwargs format is invalid
104+
ValueError: If arguments format is invalid
105105
"""
106106
if not kwargs:
107107
return {}
@@ -134,7 +134,7 @@ def _parse_limits(self, limits):
         Parse and validate multimodal input limits.
 
         Args:
-            limits: Input limits dictionary
+            limits (dict): Input limits configuration
 
         Returns:
             dict: Validated limits with defaults
@@ -161,7 +161,7 @@ def _check_mm_limits(self, item):
         Validate multimodal inputs against configured limits.
 
         Args:
-            item: Input request item to check
+            item: Input request item to validate
 
         Raises:
             ValueError: If input exceeds configured limits
@@ -176,9 +176,9 @@ def _check_mm_limits(self, item):
         for message in item:
             if isinstance(message.get("content"), list):
                 for part in message["content"]:
-                    if part.get("type") == "image":
+                    if part.get("type") in ["image_url", "image"]:
                         mm_data["image"].append(part)
-                    elif part.get("type") == "video":
+                    elif part.get("type") in ["video_url", "video"]:
                         mm_data["video"].append(part)
 
         for modality, data in mm_data.items():
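
Accepting image_url/video_url alongside the bare image/video types means OpenAI-style URL parts now count toward limit_mm_per_prompt instead of slipping past the check. An illustrative payload (the exact part shapes below follow the common chat format and are assumptions, not taken from this diff):

    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": "compare these two"},
            {"type": "image", "image": "frame.png"},
            {"type": "image_url", "image_url": {"url": "https://example.com/a.jpg"}},
        ],
    }]
    # Both image parts are now tallied, so a limit of one image per
    # prompt would make _check_mm_limits raise ValueError here.
    processor._check_mm_limits(messages)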
@@ -192,8 +192,8 @@ def process_request_dict(self, request, max_model_len=None):
         Process request dictionary into model inputs.
 
         Args:
-            request: Input request dictionary
-            max_model_len: Maximum model context length
+            request (dict): Input request dictionary
+            max_model_len (int, optional): Maximum context length
 
         Returns:
             dict: Processed request with model inputs
@@ -253,6 +253,13 @@ def process_request_dict(self, request, max_model_len=None):
         return request
 
     def append_generated_tokens(self, outputs, generated_token_ids):
+        """
+        Append generated tokens to existing outputs.
+
+        Args:
+            outputs: Current model outputs
+            generated_token_ids: Generated tokens to append
+        """
         out = {"input_ids": [], "token_type_ids": [], "position_ids": [], "cur_position": outputs["cur_position"]}
         self.processor._add_text(generated_token_ids, out)
 
@@ -263,11 +270,20 @@ def append_generated_tokens(self, outputs, generated_token_ids):
             [outputs["token_type_ids"], np.array(out["token_type_ids"], dtype=np.int64)], axis=0
         )
         outputs["position_ids"] = np.concatenate(
-            [outputs["position_ids"], out["position_ids"]], axis=1, dtype=np.int64
+            [outputs["position_ids"], out["position_ids"][0]], axis=1, dtype=np.int64
         )
         outputs["cur_position"] = out["cur_position"]
 
     def pack_outputs(self, outputs):
+        """
+        Prepare final output dictionary for model.
+
+        Args:
+            outputs: Intermediate processing outputs
+
+        Returns:
+            dict: Packed output dictionary with all required fields
+        """
         outputs["image_patch_id"] = self.processor.image_token_id
         outputs["video_patch_id"] = self.processor.video_token_id
         outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
