Commit 39d0b74

feat: model dots.ocr
1 parent 15fe27e

6 files changed: +78 -0 lines

src/emd/models/engines.py (+10 -0)

@@ -545,6 +545,16 @@ class KtransformersEngine(OpenAICompitableEngine):
     "default_cli_args": " --max_new_tokens 2048",
 })
 
+# VLLM Engine v0.9.1 for dots.ocr
+vllm_dots_ocr_engine091 = VllmEngine(**{
+    **vllm_engine064.model_dump(),
+    "engine_dockerfile_config": {"VERSION":"v0.9.1"},
+    "dockerfile_name": "Dockerfile_dots_ocr",
+    "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
+    "default_cli_args": " --trust-remote-code --chat-template-content-format string --gpu-memory-utilization 0.95 --max_model_len 8192 --disable-log-stats --max_num_seq 5 --enforce-eager",
+    "description": "VLLM v0.9.1 engine for dots.ocr multilingual document parsing model with flash-attn support and eager execution for custom models"
+})
+
 custom_engine = Engine(**{
     "engine_type":EngineType.CUSTOM,
 })
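
The new engine does not restate the full configuration: it copies every field from vllm_engine064 via model_dump() and then overrides only the dots.ocr-specific keys. The sketch below illustrates that inheritance pattern in isolation, assuming the engine classes are pydantic v2 models (which model_dump() suggests); the field names and defaults are illustrative, not EMD's actual schema.

# Minimal sketch of the "copy base config, override a few keys" pattern.
from pydantic import BaseModel

class EngineConfig(BaseModel):
    engine_dockerfile_config: dict = {"VERSION": "v0.6.4"}
    dockerfile_name: str = "Dockerfile"
    default_cli_args: str = " --max_new_tokens 2048"

base = EngineConfig()

# When two sets of keyword arguments are unpacked into the constructor,
# later keys win, so only the overridden fields need to be restated.
dots_ocr_engine = EngineConfig(**{
    **base.model_dump(),
    "engine_dockerfile_config": {"VERSION": "v0.9.1"},
    "dockerfile_name": "Dockerfile_dots_ocr",
})

print(dots_ocr_engine.default_cli_args)  # inherited from base
print(dots_ocr_engine.dockerfile_name)   # overridden: Dockerfile_dots_ocr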

src/emd/models/model_series.py (+6 -0)

@@ -157,3 +157,9 @@
     description="General Multimodal Embedding (GME) models based on Qwen2-VL architecture, designed for unified multimodal representation learning supporting text, image, and image-text pair inputs for retrieval and search applications.",
     reference_link="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct"
 )
+
+DOTS_OCR_SERIES = ModelSeries(
+    model_series_name=ModelSeriesType.DOTS_OCR,
+    description="dots.ocr is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art (SOTA) performance on text, tables, and reading order tasks with multilingual support for over 100 languages.",
+    reference_link="https://github.com/rednote-hilab/dots.ocr"
+)

src/emd/models/utils/constants.py (+1 -0)

@@ -235,3 +235,4 @@ class ModelSeriesType(ConstantBase):
     DEEPSEEK_REASONING_MODEL = "deepseek reasoning model"
     DEEPSEEK_v3 = "deepseek v3"
     BAICHUAN = "baichuan"
+    DOTS_OCR = "dots_ocr"

src/emd/models/vlms/__init__.py (+1 -0)

@@ -2,3 +2,4 @@
 from . import internvl
 from . import gemma3
 from . import mistral
+from . import dots_ocr

src/emd/models/vlms/dots_ocr.py (+47 -0, new file)

@@ -0,0 +1,47 @@
+from .. import Model
+from ..model_series import DOTS_OCR_SERIES
+from ..engines import vllm_dots_ocr_engine091, huggingface_llm_engine_4d41d2
+from ..instances import (
+    g5d2xlarge_instance,
+    g5d4xlarge_instance,
+    g5d8xlarge_instance,
+    g5d12xlarge_instance,
+    g5d16xlarge_instance,
+    g5d24xlarge_instance,
+    g5d48xlarge_instance,
+    local_instance
+)
+from ..services import (
+    sagemaker_service,
+    sagemaker_async_service,
+    ecs_service,
+    local_service
+)
+from ..frameworks import fastapi_framework
+from emd.models.utils.constants import ModelType
+
+Model.register(
+    dict(
+        model_id="rednote-hilab-dots-ocr",
+        model_type=ModelType.VLM,
+        description="dots.ocr is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model. Built on a compact 1.7B-parameter LLM foundation, it achieves state-of-the-art performance on text, tables, and reading order tasks with support for over 100 languages including English, Chinese, and many others.",
+        application_scenario="multilingual document layout parsing, OCR, document understanding, table extraction, formula recognition, reading order detection",
+        supported_engines=[vllm_dots_ocr_engine091, huggingface_llm_engine_4d41d2],
+        supported_instances=[
+            g5d2xlarge_instance, g5d4xlarge_instance, g5d8xlarge_instance,
+            g5d12xlarge_instance, g5d16xlarge_instance, g5d24xlarge_instance,
+            g5d48xlarge_instance, local_instance
+        ],
+        supported_services=[
+            sagemaker_service, sagemaker_async_service, ecs_service, local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="rednote-hilab/dots.ocr",
+        modelscope_model_id="rednote-hilab/dots.ocr",
+        require_huggingface_token=False,
+        model_series=DOTS_OCR_SERIES
+    )
+)
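
Because the supported engine derives from the OpenAI-compatible vLLM engine (OpenAICompitableEngine), a deployed dots.ocr endpoint should accept standard chat-completions requests with image content. The sketch below shows one way such a request might look using the openai Python client; the base_url, API key, image file, and served model name are placeholders, so substitute the values reported by your actual deployment.

# Hedged sketch: sending a document image to a deployed dots.ocr endpoint
# through the OpenAI-compatible API served by vLLM.
import base64
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="EMPTY")  # placeholder endpoint

with open("invoice.png", "rb") as f:  # placeholder input document
    image_b64 = base64.b64encode(f.read()).decode()

response = client.chat.completions.create(
    model="rednote-hilab-dots-ocr",  # placeholder served model id
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            {"type": "text",
             "text": "Parse the document layout and extract the text in reading order."},
        ],
    }],
    max_tokens=2048,
)
print(response.choices[0].message.content)
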
Dockerfile_dots_ocr (+13 -0, new file)

@@ -0,0 +1,13 @@
+FROM vllm/vllm-openai:{{VERSION}} AS vllm-base
+
+RUN pip3 install flash_attn==2.8.0.post2
+RUN pip3 install transformers==4.51.3
+
+FROM vllm-base AS sagemaker-serving
+
+RUN pip install boto3 hf_transfer modelscope
+
+EXPOSE 8080
+WORKDIR /opt/ml/code
+
+ENTRYPOINT ["/usr/bin/serve"]
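
The sagemaker-serving stage exposes port 8080 and launches /usr/bin/serve, which matches the usual SageMaker hosting contract (health checks on /ping, inference on /invocations). A minimal local smoke test might look like the sketch below, assuming the container was started with port 8080 published; the image tag and port mapping are illustrative.

# Hedged sketch: verify a locally running sagemaker-serving container
# responds to the SageMaker-style health check endpoint.
import requests

resp = requests.get("http://localhost:8080/ping", timeout=5)
print(resp.status_code)  # 200 indicates the serving stack is up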
