@@ -213,6 +213,64 @@ def test_multi_images_chat_completion(self):
213
213
assert response .usage .completion_tokens > 0
214
214
assert response .usage .total_tokens > 0
215
215
216
def _test_mixed_image_audio_chat_completion(self):
    """Check a chat completion whose single user turn mixes image and audio.

    One user message carrying an image URL, an audio URL and a text
    instruction is sent to the server at ``self.base_url``. The reply must
    come from the assistant, be a string, mention at least one of the
    expected image keywords, contain every expected transcript phrase, and
    report positive token usage.

    NOTE(review): the ``_test`` prefix means default unittest/pytest
    discovery will not collect this method -- presumably it is disabled on
    purpose; confirm before renaming.
    """
    client = openai.Client(api_key=self.api_key, base_url=self.base_url)

    response = client.chat.completions.create(
        model="default",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": IMAGE_MAN_IRONING_URL},
                    },
                    {
                        "type": "audio_url",
                        "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
                    },
                    {
                        "type": "text",
                        "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
                    },
                ],
            },
        ],
        temperature=0,
        **(self.get_vision_request_kwargs()),
    )

    assert response.choices[0].message.role == "assistant"
    text = response.choices[0].message.content
    assert isinstance(text, str)
    print("-" * 30)
    print(f"Mixed image & audio response:\n {text} ")
    print("-" * 30)

    # The expected phrases are lowercase, while a model normally capitalises
    # sentence starts ("Thank you ..."). Match against a lowercased copy so
    # correct answers are not rejected on letter case alone; the raw text is
    # still what gets printed and reported in failure messages.
    normalized = text.lower()

    image_keywords = ("man", "cab", "suv", "taxi", "car")
    assert any(
        keyword in normalized for keyword in image_keywords
    ), f"text: {text} , should contain man, cab, SUV, taxi or car"

    check_list = [
        "thank you",
        "it's a privilege to be here",
        "leader",
        "science",
        "art",
    ]
    for check_word in check_list:
        assert (
            check_word in normalized
        ), f"text: |{text} | should contain |{check_word} |"

    assert response.id
    assert response.created
    assert response.usage.prompt_tokens > 0
    assert response.usage.completion_tokens > 0
    assert response.usage.total_tokens > 0
273
+
216
274
def prepare_video_images_messages (self , video_path ):
217
275
# the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
218
276
# the size of the video embeds differs from the `modality` argument when preprocessed
0 commit comments