@@ -178,15 +178,18 @@ def _generate_bge_vl_mllm_embeddings(self, inputs):
         # Process text-only inputs
         if text_inputs:
             try:
-                for text in text_inputs:
+                with torch.no_grad():
+                    self.model.set_processor(self.model_abs_path)
                     candidate_inputs = self.model.data_process(
-                        text=text,
+                        text=text_inputs,
                         q_or_c="c"
                     )
-                    with torch.no_grad():
-                        text_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
-                        text_emb = torch.nn.functional.normalize(text_emb, dim=-1)
-                        all_embeddings.append(text_emb.cpu().tolist()[0])
+                    text_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                    text_emb = torch.nn.functional.normalize(text_emb, dim=-1)
+                    if hasattr(text_emb, 'tolist'):
+                        all_embeddings.extend(text_emb.tolist())
+                    else:
+                        all_embeddings.extend(text_emb)
             except Exception as e:
                 logger.error(f"Failed to encode text inputs with MLLM: {e}")
                 raise ValueError(f"BGE-VL-MLLM text encoding failed: {e}")
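For reference, the batched path this hunk introduces boils down to last-token pooling followed by L2 normalization. A minimal self-contained sketch of that pattern on a toy tensor (the shapes and the stand-in hidden_states are illustrative assumptions, not the model's real output):

    import torch

    # Toy stand-in for the model's hidden states: (batch, seq_len, hidden_dim).
    hidden_states = torch.randn(3, 16, 64)

    with torch.no_grad():
        emb = hidden_states[:, -1, :]                     # last-token pooling -> (3, 64)
        emb = torch.nn.functional.normalize(emb, dim=-1)  # unit-norm rows

    # One Python list per input, as consumed by all_embeddings.extend(...).
    embeddings = emb.tolist()
    print(len(embeddings), len(embeddings[0]))  # 3 64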
@@ -212,22 +215,26 @@ def _generate_bge_vl_mllm_embeddings(self, inputs):
 
         # Process multimodal inputs (text + image)
         if multimodal_inputs:
-            for text, bytesio_image in multimodal_inputs:
-                try:
-                    # Convert BytesIO back to PIL Image for MLLM model
-                    pil_image = Image.open(bytesio_image)
-                    candidate_inputs = self.model.data_process(
-                        text=text,
-                        images=[pil_image],
-                        q_or_c="c"
-                    )
-                    with torch.no_grad():
-                        multimodal_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
-                        multimodal_emb = torch.nn.functional.normalize(multimodal_emb, dim=-1)
-                        all_embeddings.append(multimodal_emb.cpu().tolist()[0])
-                except Exception as e:
-                    logger.error(f"Failed to encode multimodal input with MLLM: {e}")
-                    raise ValueError(f"BGE-VL-MLLM multimodal encoding failed: {e}")
+            with torch.no_grad():
+                self.model.set_processor(self.model_abs_path)
+                for text, bytesio_image in multimodal_inputs:
+                    try:
+                        # Pass the BytesIO image straight through to the processor
+                        candidate_inputs = self.model.data_process(
+                            text=[text],
+                            images=[bytesio_image],
+                            q_or_c="c"
+                        )
+                        with torch.no_grad():
+                            multimodal_emb = self.model(**candidate_inputs, output_hidden_states=True)[:, -1, :]
+                            multimodal_emb = torch.nn.functional.normalize(multimodal_emb, dim=-1)
+                            if hasattr(multimodal_emb, 'tolist'):
+                                all_embeddings.extend(multimodal_emb.tolist())
+                            else:
+                                all_embeddings.extend(multimodal_emb)
+                    except Exception as e:
+                        logger.error(f"Failed to encode multimodal input with MLLM: {e}")
+                        raise ValueError(f"BGE-VL-MLLM multimodal encoding failed: {e}")
 
         return all_embeddings
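One behavioral note on the switch from append to extend: the old per-item loop called tolist()[0], which is only safe for a batch of one. With the inputs now batched through a single forward pass, extend(emb.tolist()) keeps one embedding per input. A quick sketch of the difference on a toy tensor (names here are hypothetical):

    import torch

    batch = torch.randn(3, 4)  # three embeddings of dimension 4

    # Old pattern: tolist()[0] keeps only the first row of a batch.
    kept_one = []
    kept_one.append(batch.tolist()[0])

    # New pattern: extend() with the full tolist() keeps every row.
    kept_all = []
    kept_all.extend(batch.tolist())

    print(len(kept_one), len(kept_all))  # 1 3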