Skip to content

Commit 66d6be0

Browse files
authored
Bug fix: use correct mm_items in embed_mm_inputs (sgl-project#8893)
1 parent 1c1f8a1 commit 66d6be0

File tree

3 files changed

+62
-1
lines changed

3 files changed

+62
-1
lines changed

python/sglang/srt/managers/mm_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -560,7 +560,7 @@ def embed_mm_inputs(
560560
]
561561
items_size[i + 1] = len(mm_items)
562562
items_offsets.append(
563-
flatten_nested_list([item.offsets for item in mm_inputs.mm_items])
563+
flatten_nested_list([item.offsets for item in mm_items])
564564
)
565565
items_size = torch.cumsum(items_size, dim=0).tolist()
566566

test/srt/test_vision_openai_server_b.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,9 @@ def test_audio_chat_completion(self):
189189
# This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
190190
# self._test_audio_ambient_completion()
191191

192+
def _test_mixed_image_audio_chat_completion(self):
193+
self._test_mixed_image_audio_chat_completion()
194+
192195

193196
class TestQwen2AudioServer(TestOpenAIVisionServer):
194197
@classmethod

test/srt/test_vision_openai_server_common.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,64 @@ def test_multi_images_chat_completion(self):
213213
assert response.usage.completion_tokens > 0
214214
assert response.usage.total_tokens > 0
215215

216+
def _test_mixed_image_audio_chat_completion(self):
217+
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
218+
219+
response = client.chat.completions.create(
220+
model="default",
221+
messages=[
222+
{
223+
"role": "user",
224+
"content": [
225+
{
226+
"type": "image_url",
227+
"image_url": {"url": IMAGE_MAN_IRONING_URL},
228+
},
229+
{
230+
"type": "audio_url",
231+
"audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
232+
},
233+
{
234+
"type": "text",
235+
"text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
236+
},
237+
],
238+
},
239+
],
240+
temperature=0,
241+
**(self.get_vision_request_kwargs()),
242+
)
243+
244+
assert response.choices[0].message.role == "assistant"
245+
text = response.choices[0].message.content
246+
assert isinstance(text, str)
247+
print("-" * 30)
248+
print(f"Mixed image & audio response:\n{text}")
249+
print("-" * 30)
250+
assert (
251+
"man" in text
252+
or "cab" in text
253+
or "SUV" in text
254+
or "taxi" in text
255+
or "car" in text
256+
), f"text: {text}, should contain man, cab, SUV, taxi or car"
257+
check_list = [
258+
"thank you",
259+
"it's a privilege to be here",
260+
"leader",
261+
"science",
262+
"art",
263+
]
264+
for check_word in check_list:
265+
assert (
266+
check_word in text
267+
), f"text: |{text}| should contain |{check_word}|"
268+
assert response.id
269+
assert response.created
270+
assert response.usage.prompt_tokens > 0
271+
assert response.usage.completion_tokens > 0
272+
assert response.usage.total_tokens > 0
273+
216274
def prepare_video_images_messages(self, video_path):
217275
# the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
218276
# the size of the video embeds differs from the `modality` argument when preprocessed

0 commit comments

Comments (0)