1
+ """
2
+ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+
1
17
import unittest
2
18
from unittest .mock import MagicMock , patch
3
19
10
26
11
27
12
28
def mock_pil_image(height, width):
    """Build a PIL image filled with random pixel values.

    Args:
        height: Number of pixel rows in the generated image.
        width: Number of pixel columns in the generated image.

    Returns:
        The object produced by ``Image.fromarray`` from a random
        ``(height, width, 3)`` uint8 array whose values lie in [0, 255].
    """
    # Three channels per pixel; the upper bound of randint is exclusive.
    shape = (height, width, 3)
    pixels = np.random.randint(0, 256, size=shape, dtype=np.uint8)
    return Image.fromarray(pixels)
15
40
16
41
17
42
def mock_parse_chat_messages ():
43
+ """Generate mock chat messages with image, video and text content
44
+
45
+ Returns:
46
+ List of chat message dictionaries containing:
47
+ - Mock image data (480x640 pixels)
48
+ - Mock video data (dummy bytes)
49
+ - Sample text prompt
50
+ """
18
51
messages = [
19
52
{
20
53
"role" : "user" ,
@@ -37,6 +70,17 @@ def mock_parse_chat_messages():
37
70
38
71
39
72
def mock_video_frames (num_frames , height , width ):
73
+ """Generate mock video frames with random pixel data
74
+
75
+ Args:
76
+ num_frames: Number of frames to generate
77
+ height: Frame height in pixels
78
+ width: Frame width in pixels
79
+
80
+ Returns:
81
+ Numpy array of shape (num_frames, height, width, 3)
82
+ containing random RGB frames
83
+ """
40
84
frames = []
41
85
for i in range (num_frames ):
42
86
frame = np .random .randint (0 , 256 , (height , width , 3 ), dtype = np .uint8 )
@@ -45,6 +89,16 @@ def mock_video_frames(num_frames, height, width):
45
89
46
90
47
91
def mock_load_and_process_video ():
92
+ """Mock video loading and processing
93
+
94
+ Returns:
95
+ Tuple containing:
96
+ - frames: 3 mock video frames (480x640 resolution)
97
+ - meta: Dictionary with mock video metadata:
98
+ * fps: 1
99
+ * duration: 3 seconds
100
+ * num_of_frame: 3
101
+ """
48
102
frames = mock_video_frames (num_frames = 3 , height = 480 , width = 640 )
49
103
meta = {
50
104
"fps" : 1 ,
@@ -55,8 +109,14 @@ def mock_load_and_process_video():
55
109
56
110
57
111
class TestQwenVLProcessor (unittest .TestCase ):
112
+ """Unit tests for Qwen Vision-Language Processor functionality"""
58
113
59
114
def setUp (self ):
115
+ """Initialize test case with:
116
+ - Mock configuration
117
+ - Patched message parsing and video processing methods
118
+ - QwenVLProcessor instance with test parameters
119
+ """
60
120
config = MagicMock ()
61
121
config .vision_config .tokens_per_second = 2
62
122
@@ -76,7 +136,7 @@ def setUp(self):
76
136
}
77
137
limit_mm_per_prompt = {"image" : 1 , "video" : 1 , "audio" : 1 }
78
138
79
- model_name_or_path = "/workspace/Fastdeploy/test/ ModelData/Qwen2.5-VL-7B-Instruct"
139
+ model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
80
140
self .processor = QwenVLProcessor (
81
141
config = config ,
82
142
model_name_or_path = model_name_or_path ,
@@ -87,10 +147,19 @@ def setUp(self):
87
147
)
88
148
89
149
def tearDown(self) -> None:
    """Stop the two patchers held on this test case.

    The patchers are stopped in a fixed order: first
    ``patcher_parse_chat_messages``, then ``patcher_load_and_process_video``.
    """
    # Look up and stop one patcher at a time so the order of attribute
    # access and ``stop()`` calls is strictly sequential.
    for patcher_attr in (
        "patcher_parse_chat_messages",
        "patcher_load_and_process_video",
    ):
        getattr(self, patcher_attr).stop()
92
153
93
154
def test_process_request (self ):
155
+ """Test processing of Request object with multimodal input
156
+
157
+ Validates:
158
+ 1. Token ID lengths match position_ids and token_type_ids shapes
159
+ 2. Image processing produces expected output dimensions
160
+ 3. Video processing produces expected output dimensions
161
+ 4. Correct counts for images (1) and videos (1)
162
+ """
94
163
prompt = {
95
164
"request_id" : "123" ,
96
165
"messages" : [
@@ -121,6 +190,14 @@ def test_process_request(self):
121
190
self .assertEqual (result .multimodal_inputs ["video_cnt" ], 1 )
122
191
123
192
def test_process_request_dict (self ):
193
+ """Test processing of dictionary-format request with multimodal input
194
+
195
+ Validates:
196
+ 1. Token ID lengths match position_ids and token_type_ids shapes
197
+ 2. Image processing produces expected output dimensions
198
+ 3. Video processing produces expected output dimensions
199
+ 4. Correct counts for images (1) and videos (1)
200
+ """
124
201
num_generated_token_ids = 10
125
202
request = {
126
203
"metadata" : {
0 commit comments