Commit 39d0b74

feat: model dots.ocr
1 parent 15fe27e

6 files changed: +78 -0 lines

src/emd/models/engines.py (+10 -0)

@@ -545,6 +545,16 @@ class KtransformersEngine(OpenAICompitableEngine):
     "default_cli_args": " --max_new_tokens 2048",
 })
 
+# VLLM Engine v0.9.1 for dots.ocr
+vllm_dots_ocr_engine091 = VllmEngine(**{
+    **vllm_engine064.model_dump(),
+    "engine_dockerfile_config": {"VERSION":"v0.9.1"},
+    "dockerfile_name": "Dockerfile_dots_ocr",
+    "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
+    "default_cli_args": " --trust-remote-code --chat-template-content-format string --gpu-memory-utilization 0.95 --max_model_len 8192 --disable-log-stats --max_num_seq 5 --enforce-eager",
+    "description": "VLLM v0.9.1 engine for dots.ocr multilingual document parsing model with flash-attn support and eager execution for custom models"
+})
+
 custom_engine = Engine(**{
     "engine_type":EngineType.CUSTOM,
 })
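
The new engine does not restate the full configuration: it copies every field from vllm_engine064 via model_dump() and then overrides only the dots.ocr-specific keys. The sketch below illustrates that inheritance pattern in isolation, assuming the engine classes are pydantic v2 models (which model_dump() suggests); the field names and defaults are illustrative, not EMD's actual schema.

# Minimal sketch of the "copy base config, override a few keys" pattern.
from pydantic import BaseModel

class EngineConfig(BaseModel):
    engine_dockerfile_config: dict = {"VERSION": "v0.6.4"}
    dockerfile_name: str = "Dockerfile"
    default_cli_args: str = " --max_new_tokens 2048"

base = EngineConfig()

# When two sets of keyword arguments are unpacked into the constructor,
# later keys win, so only the overridden fields need to be restated.
dots_ocr_engine = EngineConfig(**{
    **base.model_dump(),
    "engine_dockerfile_config": {"VERSION": "v0.9.1"},
    "dockerfile_name": "Dockerfile_dots_ocr",
})

print(dots_ocr_engine.default_cli_args)  # inherited from base
print(dots_ocr_engine.dockerfile_name)   # overridden: Dockerfile_dots_ocr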

src/emd/models/model_series.py (+6 -0)

@@ -157,3 +157,9 @@
     description="General Multimodal Embedding (GME) models based on Qwen2-VL architecture, designed for unified multimodal representation learning supporting text, image, and image-text pair inputs for retrieval and search applications.",
     reference_link="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct"
 )
+
+DOTS_OCR_SERIES = ModelSeries(
+    model_series_name=ModelSeriesType.DOTS_OCR,
+    description="dots.ocr is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art (SOTA) performance on text, tables, and reading order tasks with multilingual support for over 100 languages.",
+    reference_link="https://github.com/rednote-hilab/dots.ocr"
+)

src/emd/models/utils/constants.py (+1 -0)

@@ -235,3 +235,4 @@ class ModelSeriesType(ConstantBase):
     DEEPSEEK_REASONING_MODEL = "deepseek reasoning model"
     DEEPSEEK_v3 = "deepseek v3"
     BAICHUAN = "baichuan"
+    DOTS_OCR = "dots_ocr"

src/emd/models/vlms/__init__.py (+1 -0)

@@ -2,3 +2,4 @@
 from . import internvl
 from . import gemma3
 from . import mistral
+from . import dots_ocr

src/emd/models/vlms/dots_ocr.py (+47 -0, new file)

@@ -0,0 +1,47 @@
+from .. import Model
+from ..model_series import DOTS_OCR_SERIES
+from ..engines import vllm_dots_ocr_engine091, huggingface_llm_engine_4d41d2
+from ..instances import (
+    g5d2xlarge_instance,
+    g5d4xlarge_instance,
+    g5d8xlarge_instance,
+    g5d12xlarge_instance,
+    g5d16xlarge_instance,
+    g5d24xlarge_instance,
+    g5d48xlarge_instance,
+    local_instance
+)
+from ..services import (
+    sagemaker_service,
+    sagemaker_async_service,
+    ecs_service,
+    local_service
+)
+from ..frameworks import fastapi_framework
+from emd.models.utils.constants import ModelType
+
+Model.register(
+    dict(
+        model_id="rednote-hilab-dots-ocr",
+        model_type=ModelType.VLM,
+        description="dots.ocr is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model. Built on a compact 1.7B-parameter LLM foundation, it achieves state-of-the-art performance on text, tables, and reading order tasks with support for over 100 languages including English, Chinese, and many others.",
+        application_scenario="multilingual document layout parsing, OCR, document understanding, table extraction, formula recognition, reading order detection",
+        supported_engines=[vllm_dots_ocr_engine091, huggingface_llm_engine_4d41d2],
+        supported_instances=[
+            g5d2xlarge_instance, g5d4xlarge_instance, g5d8xlarge_instance,
+            g5d12xlarge_instance, g5d16xlarge_instance, g5d24xlarge_instance,
+            g5d48xlarge_instance, local_instance
+        ],
+        supported_services=[
+            sagemaker_service, sagemaker_async_service, ecs_service, local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="rednote-hilab/dots.ocr",
+        modelscope_model_id="rednote-hilab/dots.ocr",
+        require_huggingface_token=False,
+        model_series=DOTS_OCR_SERIES
+    )
+)
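
Because the supported engine derives from the OpenAI-compatible vLLM engine (OpenAICompitableEngine), a deployed dots.ocr endpoint should accept standard chat-completions requests with image content. The sketch below shows one way such a request might look using the openai Python client; the base_url, API key, image file, and served model name are placeholders, so substitute the values reported by your actual deployment.

# Hedged sketch: sending a document image to a deployed dots.ocr endpoint
# through the OpenAI-compatible API served by vLLM.
import base64
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="EMPTY")  # placeholder endpoint

with open("invoice.png", "rb") as f:  # placeholder input document
    image_b64 = base64.b64encode(f.read()).decode()

response = client.chat.completions.create(
    model="rednote-hilab-dots-ocr",  # placeholder served model id
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            {"type": "text",
             "text": "Parse the document layout and extract the text in reading order."},
        ],
    }],
    max_tokens=2048,
)
print(response.choices[0].message.content)
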
Dockerfile_dots_ocr (+13 -0, new file)

@@ -0,0 +1,13 @@
+FROM vllm/vllm-openai:{{VERSION}} AS vllm-base
+
+RUN pip3 install flash_attn==2.8.0.post2
+RUN pip3 install transformers==4.51.3
+
+FROM vllm-base AS sagemaker-serving
+
+RUN pip install boto3 hf_transfer modelscope
+
+EXPOSE 8080
+WORKDIR /opt/ml/code
+
+ENTRYPOINT ["/usr/bin/serve"]
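
The sagemaker-serving stage exposes port 8080 and launches /usr/bin/serve, which matches the usual SageMaker hosting contract (health checks on /ping, inference on /invocations). A minimal local smoke test might look like the sketch below, assuming the container was started with port 8080 published; the image tag and port mapping are illustrative.

# Hedged sketch: verify a locally running sagemaker-serving container
# responds to the SageMaker-style health check endpoint.
import requests

resp = requests.get("http://localhost:8080/ping", timeout=5)
print(resp.status_code)  # 200 indicates the serving stack is up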
