@@ -178,15 +178,18 @@ def _generate_bge_vl_mllm_embeddings(self, inputs):
         # Process text-only inputs
         if text_inputs:
             try:
-                for text in text_inputs:
+                with torch.no_grad():
+                    self.model.set_processor(self.model_abs_path)
                     candidate_inputs = self.model.data_process(
-                        text=text,
+                        text=text_inputs,
                         q_or_c="c"
                     )
-                    with torch.no_grad():
-                        text_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
-                        text_emb = torch.nn.functional.normalize(text_emb, dim=-1)
-                        all_embeddings.append(text_emb.cpu().tolist()[0])
+                    text_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                    text_emb = torch.nn.functional.normalize(text_emb, dim=-1)
+                    if hasattr(text_emb, 'tolist'):
+                        all_embeddings.extend(text_emb.tolist())
+                    else:
+                        all_embeddings.extend(text_emb)
             except Exception as e:
                 logger.error(f"Failed to encode text inputs with MLLM: {e}")
                 raise ValueError(f"BGE-VL-MLLM text encoding failed: {e}")
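For reference, the batched path this hunk introduces boils down to last-token pooling followed by L2 normalization. A minimal self-contained sketch of that pattern on a toy tensor (the shapes and the stand-in hidden_states are illustrative assumptions, not the model's real output):

    import torch

    # Toy stand-in for the model's hidden states: (batch, seq_len, hidden_dim).
    hidden_states = torch.randn(3, 16, 64)

    with torch.no_grad():
        emb = hidden_states[:, -1, :]                     # last-token pooling -> (3, 64)
        emb = torch.nn.functional.normalize(emb, dim=-1)  # unit-norm rows

    # One Python list per input, as consumed by all_embeddings.extend(...).
    embeddings = emb.tolist()
    print(len(embeddings), len(embeddings[0]))  # 3 64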
@@ -212,22 +215,26 @@ def _generate_bge_vl_mllm_embeddings(self, inputs):
 
         # Process multimodal inputs (text + image)
         if multimodal_inputs:
-            for text, bytesio_image in multimodal_inputs:
-                try:
-                    # Convert BytesIO back to PIL Image for MLLM model
-                    pil_image = Image.open(bytesio_image)
-                    candidate_inputs = self.model.data_process(
-                        text=text,
-                        images=[pil_image],
-                        q_or_c="c"
-                    )
-                    with torch.no_grad():
-                        multimodal_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
-                        multimodal_emb = torch.nn.functional.normalize(multimodal_emb, dim=-1)
-                        all_embeddings.append(multimodal_emb.cpu().tolist()[0])
-                except Exception as e:
-                    logger.error(f"Failed to encode multimodal input with MLLM: {e}")
-                    raise ValueError(f"BGE-VL-MLLM multimodal encoding failed: {e}")
+            with torch.no_grad():
+                self.model.set_processor(self.model_abs_path)
+                for text, bytesio_image in multimodal_inputs:
+                    try:
+                        # Pass the BytesIO image straight through to the processor
+                        candidate_inputs = self.model.data_process(
+                            text=[text],
+                            images=[bytesio_image],
+                            q_or_c="c"
+                        )
+                        with torch.no_grad():
+                            multimodal_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                            multimodal_emb = torch.nn.functional.normalize(multimodal_emb, dim=-1)
+                            if hasattr(multimodal_emb, 'tolist'):
+                                all_embeddings.extend(multimodal_emb.tolist())
+                            else:
+                                all_embeddings.extend(multimodal_emb)
+                    except Exception as e:
+                        logger.error(f"Failed to encode multimodal input with MLLM: {e}")
+                        raise ValueError(f"BGE-VL-MLLM multimodal encoding failed: {e}")
 
         return all_embeddings
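One behavioral note on the switch from append to extend: the old per-item loop called tolist()[0], which is only safe for a batch of one. With the inputs now batched through a single forward pass, extend(emb.tolist()) keeps one embedding per input. A quick sketch of the difference on a toy tensor (names here are hypothetical):

    import torch

    batch = torch.randn(3, 4)  # three embeddings of dimension 4

    # Old pattern: tolist()[0] keeps only the first row of a batch.
    kept_one = []
    kept_one.append(batch.tolist()[0])

    # New pattern: extend() with the full tolist() keeps every row.
    kept_all = []
    kept_all.extend(batch.tolist())

    print(len(kept_one), len(kept_all))  # 1 3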