
Commit aa095b7

feat: add huggingface_embedding_engine_447, fix text and multimodal input
1 parent 5bf0922 commit aa095b7

File tree

.gitignore
src/emd/models/embeddings/bge_vl.py
src/emd/models/engines.py
src/pipeline/backend/huggingface/embedding/transformers_embedding_backend.py

4 files changed: +33 -25 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -134,3 +134,4 @@ emd_models/
 **artifacts
 src/pipeline/emd
 *.log
+.venv-vl/*

src/emd/models/embeddings/bge_vl.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 from .. import Model
-from ..engines import huggingface_embedding_engine449, huggingface_embedding_engine447
+from ..engines import huggingface_embedding_engine449, huggingface_embedding_engine_447
 from ..services import sagemaker_service, local_service, ecs_service
 from ..frameworks import fastapi_framework
 from ..instances import (
@@ -77,7 +77,7 @@
 Model.register(
     dict(
         model_id="bge-vl-mllm-s1",
-        supported_engines=[huggingface_embedding_engine447],
+        supported_engines=[huggingface_embedding_engine_447],
         supported_instances=[
             g5dxlarge_instance,
             g5d2xlarge_instance,
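Both hunks have to move together: the import and the supported_engines entry for bge-vl-mllm-s1 now both name huggingface_embedding_engine_447. A quick sanity check for this kind of rename is sketched below; it is not part of the commit and assumes the emd package is importable as emd.models.engines.

# Hypothetical post-rename check (not in this commit): confirm both engine
# identifiers resolve in emd.models.engines so the import above cannot fail.
import importlib

engines = importlib.import_module("emd.models.engines")
for name in ("huggingface_embedding_engine449", "huggingface_embedding_engine_447"):
    assert hasattr(engines, name), f"missing engine definition: {name}"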

src/emd/models/engines.py

Lines changed: 1 addition & 1 deletion
@@ -507,7 +507,7 @@ class KtransformersEngine(OpenAICompitableEngine):
     "pretrained_tokenizer_init_kwargs":{"trust_remote_code":True}
 })
 
-huggingface_embedding_engine447 = HuggingFaceLLMEngine(**{
+huggingface_embedding_engine_447 = HuggingFaceLLMEngine(**{
     "engine_type":EngineType.HUGGINGFACE,
     "engine_cls":"huggingface.embedding.transformers_embedding_backend.TransformerEmbeddingBackend",
     "python_name":"python3",

src/pipeline/backend/huggingface/embedding/transformers_embedding_backend.py

Lines changed: 29 additions & 22 deletions
@@ -178,15 +178,18 @@ def _generate_bge_vl_mllm_embeddings(self, inputs):
         # Process text-only inputs
         if text_inputs:
             try:
-                for text in text_inputs:
+                with torch.no_grad():
+                    self.model.set_processor(self.model_abs_path)
                     candidate_inputs = self.model.data_process(
-                        text=text,
+                        text=text_inputs,
                         q_or_c="c"
                     )
-                    with torch.no_grad():
-                        text_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
-                        text_emb = torch.nn.functional.normalize(text_emb, dim=-1)
-                        all_embeddings.append(text_emb.cpu().tolist()[0])
+                    text_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                    text_emb = torch.nn.functional.normalize(text_emb, dim=-1)
+                    if hasattr(text_emb, 'tolist'):
+                        all_embeddings.extend(text_emb.tolist())
+                    else:
+                        all_embeddings.extend(text_emb)
             except Exception as e:
                 logger.error(f"Failed to encode text inputs with MLLM: {e}")
                 raise ValueError(f"BGE-VL-MLLM text encoding failed: {e}")
@@ -212,22 +215,26 @@
 
         # Process multimodal inputs (text + image)
         if multimodal_inputs:
-            for text, bytesio_image in multimodal_inputs:
-                try:
-                    # Convert BytesIO back to PIL Image for MLLM model
-                    pil_image = Image.open(bytesio_image)
-                    candidate_inputs = self.model.data_process(
-                        text=text,
-                        images=[pil_image],
-                        q_or_c="c"
-                    )
-                    with torch.no_grad():
-                        multimodal_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
-                        multimodal_emb = torch.nn.functional.normalize(multimodal_emb, dim=-1)
-                        all_embeddings.append(multimodal_emb.cpu().tolist()[0])
-                except Exception as e:
-                    logger.error(f"Failed to encode multimodal input with MLLM: {e}")
-                    raise ValueError(f"BGE-VL-MLLM multimodal encoding failed: {e}")
+            with torch.no_grad():
+                self.model.set_processor(self.model_abs_path)
+                for text, bytesio_image in multimodal_inputs:
+                    try:
+                        # Convert BytesIO back to PIL Image for MLLM model
+                        candidate_inputs = self.model.data_process(
+                            text=[text],
+                            images=[bytesio_image],
+                            q_or_c="c"
+                        )
+                        with torch.no_grad():
+                            multimodal_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                            multimodal_emb = torch.nn.functional.normalize(multimodal_emb, dim=-1)
+                            if hasattr(multimodal_emb, 'tolist'):
+                                all_embeddings.extend(multimodal_emb.tolist())
+                            else:
+                                all_embeddings.extend(multimodal_emb)
+                    except Exception as e:
+                        logger.error(f"Failed to encode multimodal input with MLLM: {e}")
+                        raise ValueError(f"BGE-VL-MLLM multimodal encoding failed: {e}")
 
         return all_embeddings
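The multimodal path keeps its per-pair loop but now shares one set_processor call, wraps the text in a single-element list, and passes the BytesIO object straight to data_process instead of opening it as a PIL image first. The hasattr guard used in both paths covers the case where the model output is a tensor versus an already-plain sequence; a minimal illustration of why extend plus tolist is the right combination:

import torch

# .tolist() on a (batch, dim) tensor yields `batch` plain Python lists, so
# extend() adds one embedding per row; append() would nest the whole batch.
emb = torch.nn.functional.normalize(torch.randn(2, 4), dim=-1)
out = []
if hasattr(emb, "tolist"):
    out.extend(emb.tolist())    # two rows -> two separate embeddings
else:
    out.extend(emb)             # fallback for list-like outputs
assert len(out) == 2 and len(out[0]) == 4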
