24
24
25
25
class QwenVLProcessor (TextProcessor ):
26
26
"""
27
- Processor for Qwen Vision-Language models that handles multimodal inputs.
27
+ Qwen Vision-Language processor for handling multimodal inputs.
28
28
29
- Inherits from ErnieProcessor and extends functionality for :
29
+ This processor extends TextProcessor to support :
30
30
- Image and video processing
31
- - Multimodal request handling
32
- - Generation configuration
31
+ - Multimodal feature extraction
32
+ - Tokenization and position encoding
33
+ - Request processing and model input generation
33
34
34
35
Attributes:
35
- processor: Underlying DataProcessor instance
36
- tokenizer: Text tokenizer
37
- generation_config: Model generation configuration
38
- eos_token_ids: End-of-sequence token IDs
39
- limit_mm_per_prompt: Limits for multimodal inputs
36
+ processor (DataProcessor): Underlying data processor instance
37
+ tokenizer: Text tokenizer instance
38
+ limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt
40
39
"""
41
40
42
41
def __init__ (
@@ -49,14 +48,15 @@ def __init__(
49
48
tool_parser_obj = None ,
50
49
):
51
50
"""
52
- Initialize QwenVLProcessor.
51
+ Initialize QwenVLProcessor instance .
53
52
54
53
Args:
55
- config: Model configuration
56
- model_name_or_path: Path to pretrained model
57
- limit_mm_per_prompt: Limits for multimodal inputs per prompt
58
- mm_processor_kwargs: Additional kwargs for multimodal processor
59
- reasoning_parser_obj: Optional reasoning parser
54
+ config: Model configuration object
55
+ model_name_or_path (str): Pretrained model name or path
56
+ limit_mm_per_prompt (dict, optional): Limits for multimodal inputs
57
+ mm_processor_kwargs (dict, optional): Multimodal processor arguments
58
+ reasoning_parser_obj: Reasoning parser instance
59
+ tool_parser_obj: Tool parser instance
60
60
"""
61
61
super ().__init__ (model_name_or_path , reasoning_parser_obj , tool_parser_obj )
62
62
@@ -73,12 +73,12 @@ def __init__(
73
73
74
74
def process_request (self , request , max_model_len = None , ** kwargs ):
75
75
"""
76
- Process incoming request into model inputs.
76
+ Process incoming request and generate model inputs.
77
77
78
78
Args:
79
79
request: Input request object
80
- max_model_len: Maximum model context length
81
- **kwargs: Additional processing arguments
80
+ max_model_len (int, optional) : Maximum context length
81
+ **kwargs: Additional processing parameters
82
82
83
83
Returns:
84
84
Request: Processed request with model inputs
@@ -92,16 +92,16 @@ def process_request(self, request, max_model_len=None, **kwargs):
92
92
93
93
def _parse_processor_kwargs (self , kwargs ):
94
94
"""
95
- Parse and validate multimodal processor kwargs .
95
+ Parse and validate multimodal processor arguments .
96
96
97
97
Args:
98
- kwargs: Input kwargs dictionary
98
+ kwargs (dict): Processor configuration arguments
99
99
100
100
Returns:
101
- dict: Validated processor kwargs
101
+ dict: Validated processor arguments
102
102
103
103
Raises:
104
- ValueError: If kwargs format is invalid
104
+ ValueError: If arguments format is invalid
105
105
"""
106
106
if not kwargs :
107
107
return {}
@@ -134,7 +134,7 @@ def _parse_limits(self, limits):
134
134
Parse and validate multimodal input limits.
135
135
136
136
Args:
137
- limits: Input limits dictionary
137
+ limits (dict) : Input limits configuration
138
138
139
139
Returns:
140
140
dict: Validated limits with defaults
@@ -161,7 +161,7 @@ def _check_mm_limits(self, item):
161
161
Validate multimodal inputs against configured limits.
162
162
163
163
Args:
164
- item: Input request item to check
164
+ item: Input request item to validate
165
165
166
166
Raises:
167
167
ValueError: If input exceeds configured limits
@@ -176,9 +176,9 @@ def _check_mm_limits(self, item):
176
176
for message in item :
177
177
if isinstance (message .get ("content" ), list ):
178
178
for part in message ["content" ]:
179
- if part .get ("type" ) == " image" :
179
+ if part .get ("type" ) in [ "image_url" , " image"] :
180
180
mm_data ["image" ].append (part )
181
- elif part .get ("type" ) == " video" :
181
+ elif part .get ("type" ) in [ "video_url" , " video"] :
182
182
mm_data ["video" ].append (part )
183
183
184
184
for modality , data in mm_data .items ():
@@ -192,8 +192,8 @@ def process_request_dict(self, request, max_model_len=None):
192
192
Process request dictionary into model inputs.
193
193
194
194
Args:
195
- request: Input request dictionary
196
- max_model_len: Maximum model context length
195
+ request (dict) : Input request dictionary
196
+ max_model_len (int, optional) : Maximum context length
197
197
198
198
Returns:
199
199
dict: Processed request with model inputs
@@ -253,6 +253,13 @@ def process_request_dict(self, request, max_model_len=None):
253
253
return request
254
254
255
255
def append_generated_tokens (self , outputs , generated_token_ids ):
256
+ """
257
+ Append generated tokens to existing outputs.
258
+
259
+ Args:
260
+ outputs: Current model outputs
261
+ generated_token_ids: Generated tokens to append
262
+ """
256
263
out = {"input_ids" : [], "token_type_ids" : [], "position_ids" : [], "cur_position" : outputs ["cur_position" ]}
257
264
self .processor ._add_text (generated_token_ids , out )
258
265
@@ -263,11 +270,20 @@ def append_generated_tokens(self, outputs, generated_token_ids):
263
270
[outputs ["token_type_ids" ], np .array (out ["token_type_ids" ], dtype = np .int64 )], axis = 0
264
271
)
265
272
outputs ["position_ids" ] = np .concatenate (
266
- [outputs ["position_ids" ], out ["position_ids" ]], axis = 1 , dtype = np .int64
273
+ [outputs ["position_ids" ], out ["position_ids" ][ 0 ] ], axis = 1 , dtype = np .int64
267
274
)
268
275
outputs ["cur_position" ] = out ["cur_position" ]
269
276
270
277
def pack_outputs (self , outputs ):
278
+ """
279
+ Prepare final output dictionary for model.
280
+
281
+ Args:
282
+ outputs: Intermediate processing outputs
283
+
284
+ Returns:
285
+ dict: Packed output dictionary with all required fields
286
+ """
271
287
outputs ["image_patch_id" ] = self .processor .image_token_id
272
288
outputs ["video_patch_id" ] = self .processor .video_token_id
273
289
outputs ["position_ids" ] = outputs ["position_ids" ].transpose (1 , 0 )
0 commit comments