fix: revert vLLM engine to v0.8.4 for GME Qwen2-VL

yanbasic · yanbasic · commit 78d761028b4a · 2025-07-18T10:20:28.000+08:00
diff --git a/src/emd/models/embeddings/qwen.py b/src/emd/models/embeddings/qwen.py
@@ -1,5 +1,5 @@
 from .. import Model
-from ..engines import vllm_qwen3_engin091, vllm_gme_qwen2vl_engine091
+from ..engines import vllm_qwen3_engin091, vllm_gme_qwen2vl_engine091, vllm_gme_qwen2vl_engine084_compat
 from ..services import sagemaker_service,local_service,ecs_service
 from ..frameworks import fastapi_framework
 from ..instances import (
@@ -109,7 +109,7 @@
 Model.register(
     dict(
         model_id = "gme-Qwen2-VL-7B-Instruct",
-        supported_engines=[vllm_gme_qwen2vl_engine091],
+        supported_engines=[vllm_gme_qwen2vl_engine084_compat],
         supported_instances=[
             g5d4xlarge_instance,
             g5d8xlarge_instance,
@@ -130,7 +130,7 @@
         modelscope_model_id="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
         require_huggingface_token=False,
         application_scenario="Multimodal RAG, image-text retrieval, visual search",
-        description="General Multimodal Embedding model based on Qwen2-VL architecture, supporting text, image, and image-text pair inputs for unified multimodal representation learning and retrieval tasks.",
+        description="General Multimodal Embedding model based on Qwen2-VL architecture, supporting text, image, and image-text pair inputs for unified multimodal representation learning and retrieval tasks. Uses vLLM v0.8.4 for transformers compatibility.",
         model_type=ModelType.EMBEDDING,
         model_series=GME_SERIES
     )
diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py
@@ -190,6 +190,15 @@ class KtransformersEngine(OpenAICompitableEngine):
             "description": "VLLM engine for GME multimodal embedding models based on Qwen2-VL"
 })
 
+# GME-compatible engine: same base as vllm_embedding_engine091, but with VERSION set to v0.8.4 for transformers 4.51.3 compatibility
+vllm_gme_qwen2vl_engine084_compat = VllmEngine(**{
+            **vllm_embedding_engine091.model_dump(),  # start from vllm_embedding_engine091's dumped fields; the keys below override them
+            "engine_dockerfile_config": {"VERSION":"v0.8.4"},  # sets the Dockerfile VERSION to v0.8.4 (presumably the vLLM build/image tag -- confirm)
+            "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
+            "default_cli_args": " --max_num_seq 20 --disable-log-stats --trust-remote-code --task embed --limit-mm-per-prompt image=10 --gpu_memory_utilization 0.8",  # NOTE(review): vLLM documents this flag as --max-num-seqs; confirm the shortened spelling is accepted by v0.8.4
+            "description": "VLLM engine v0.8.4 for GME multimodal embedding models with compatible transformers version"
+})
+
 
 vllm_qwen2vl72b_engine064 = VllmEngine(**{
              **vllm_engine064.model_dump(),