
Commit 9f572b3

zhouxuan and AoyuQC authored
feat: add qwq-32b (#64)
* Remove forced checking of aws environment during local deployment
* merge
* modify tgi backend
* add g4dn and g6e instances
* add qwq-32b
* docs: add quick-link
* modify qwq-32b-chat-template

Co-authored-by: AoyuQC <aoyuzhan@amazon.com>
1 parent 8a4991e commit 9f572b3

File tree: 6 files changed (+121, -5 lines)


README.md

Lines changed: 4 additions & 0 deletions
@@ -14,6 +14,10 @@
   <img alt="GitHub contributors" src="https://img.shields.io/github/contributors/aws-samples/easy-model-deployer">
 </p>
 
+## 🔥 Latest News
+
+- 2025-03-06: Deploy QwQ-32B with [one command line](docs/en/best_deployment_practices.md##famous-models###qwen-series###qwq-32b).
+
 ## Introduction
 
 Easy Model Deployer is a lightweight tool designed to simplify the deployment of **Open-Source LLMs** ([Supported Models](docs/en/supported_models.md)) and Custom Models. It provides **OpenAI's Completions API** and [**LangChain Interface**](https://github.com/langchain-ai/langchain). Built for developers who need reliable and scalable model serving without complex setup, it seamlessly integrates with AWS services for efficient model deployment.
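The introduction above mentions the OpenAI-compatible API. As a quick illustration (not part of this commit's diff), here is a minimal streaming call against an EMD deployment, assuming a placeholder endpoint URL and API key; it mirrors the client test updated at the end of this commit.

```python
# Minimal sketch of calling an EMD deployment through its OpenAI-compatible API.
# The base_url and api_key below are placeholders, not values from this commit.
from openai import OpenAI

client = OpenAI(
    api_key="placeholder-key",
    base_url="http://your-emd-endpoint:8080/v1",  # hypothetical endpoint address
)

response = client.chat.completions.create(
    model="QwQ-32B",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    temperature=0.6,
)
for chunk in response:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
```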

docs/en/best_deployment_practices.md

Lines changed: 12 additions & 0 deletions
@@ -3,6 +3,18 @@
 
 This document provides examples of best practices for deploying models using EMD for various use cases.
 
+## Famous Models
+
+### Qwen Series
+
+#### QwQ-32B
+
+```bash
+emd deploy --model-id QwQ-32B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker
+```
+
+
+
 
 ## Deploying to Specific GPU Types
 
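The g5.12xlarge choice in the command above can be sanity-checked with simple arithmetic (an illustrative estimate, not from the commit): QwQ-32B has roughly 32 billion parameters, and a g5.12xlarge provides four 24 GB NVIDIA A10G GPUs.

```python
# Back-of-envelope memory estimate for QwQ-32B on g5.12xlarge (assumes bf16 weights;
# real usage also depends on KV cache, max_model_len and vLLM overhead).
params_b = 32            # ~32 billion parameters
bytes_per_param = 2      # bf16 / fp16
weights_gb = params_b * bytes_per_param   # ≈ 64 GB of weights alone
gpu_mem_gb = 24 * 4                       # g5.12xlarge: 4 x NVIDIA A10G, 24 GB each
print(f"weights ≈ {weights_gb} GB vs {gpu_mem_gb} GB total GPU memory")
# The weights alone exceed a single 24 GB GPU, so the model has to be
# sharded across the instance's four GPUs.
```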
src/emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+{%- if tools %}
+{{- '<|im_start|>system\n' }}
+{%- if messages[0]['role'] == 'system' %}
+{{- messages[0]['content'] }}
+{%- else %}
+{{- '' }}
+{%- endif %}
+{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+{%- for tool in tools %}
+{{- "\n" }}
+{{- tool | tojson }}
+{%- endfor %}
+{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+{%- if messages[0]['role'] == 'system' %}
+{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- for message in messages %}
+{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+{%- elif message.role == "assistant" and not message.tool_calls and not loop.last %}
+{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+{%- elif message.role == "assistant" and not loop.last %}
+{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+{{- '<|im_start|>' + message.role }}
+{%- if message.content %}
+{{- '\n' + content }}
+{%- endif %}
+{%- for tool_call in message.tool_calls %}
+{%- if tool_call.function is defined %}
+{%- set tool_call = tool_call.function %}
+{%- endif %}
+{{- '\n<tool_call>\n{"name": "' }}
+{{- tool_call.name }}
+{{- '", "arguments": ' }}
+{{- tool_call.arguments | tojson }}
+{{- '}\n</tool_call>' }}
+{%- endfor %}
+{{- '<|im_end|>\n' }}
+{%- elif message.role == "tool" %}
+{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+{{- '<|im_start|>user' }}
+{%- endif %}
+{{- '\n<tool_response>\n' }}
+{{- message.content }}
+{{- '\n</tool_response>' }}
+{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+{{- '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+{{- '<|im_start|>assistant\n' }}
+{%- if messages[-1].role == "assistant" %}{{- messages[-1].content }}{%- endif %}
+{%- endif %}
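The template above follows Qwen's ChatML/tool-calling layout but adds a prefill branch: a trailing assistant message is skipped by the normal message loop and re-emitted after the generation prompt, so the model continues that text instead of starting a fresh reply. A minimal rendering sketch (illustrative only; it assumes the template is saved locally as `qwq_32b_add_prefill_chat_template.jinja` and is rendered with plain `jinja2` rather than through vLLM):

```python
# Render the prefill chat template directly with jinja2 to see its output shape.
from jinja2 import Template

with open("qwq_32b_add_prefill_chat_template.jinja") as f:  # local copy of the file above
    template = Template(f.read())

messages = [
    {"role": "user", "content": "Which is larger, 9.11 or 9.9?"},
    # Trailing assistant message: re-emitted after the generation prompt,
    # so generation continues from "<think>\n" instead of starting over.
    {"role": "assistant", "content": "<think>\n"},
]

print(template.render(messages=messages, add_generation_prompt=True))
# Expected tail of the rendered prompt:
# <|im_start|>assistant
# <think>
```

This matches the commented-out `{"role": "assistant", "content": "<think>\n"}` message in the updated client test at the end of this commit.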

src/emd/models/engines.py

Lines changed: 8 additions & 0 deletions
@@ -62,6 +62,8 @@ class ComfyuiEngine(Engine):
 })
 
 
+
+
 vllm_deepseek_r1_distill_qwen_engine071 = VllmEngine(**{
     **vllm_engine064.model_dump(),
     "engine_dockerfile_config": {"VERSION":"v0.7.1"},
@@ -103,6 +105,12 @@ class ComfyuiEngine(Engine):
     "default_cli_args": " --max_model_len 25000 --disable-log-stats --limit-mm-per-prompt image=20,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
 })
 
+vllm_qwq_engine073 = VllmEngine(**{
+    **vllm_qwen25vl72b_engine073.model_dump(),
+    "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
+    "default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes"
+})
+
 
 vllm_internvl2d5_76b_engine064 = VllmEngine(**{
     **vllm_engine064.model_dump(),
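The new `vllm_qwq_engine073` is derived from the Qwen2.5-VL 72B engine config by dumping it with pydantic and overriding only the environment variables and CLI args. A self-contained sketch of that pattern (the `VllmEngine` below is a hypothetical, simplified stand-in, not the actual emd class):

```python
# Hypothetical, simplified stand-in for emd's engine config to show the
# model_dump-and-override pattern used above.
from pydantic import BaseModel

class VllmEngine(BaseModel):
    engine_dockerfile_config: dict = {}
    environment_variables: str = ""
    default_cli_args: str = ""

base_engine = VllmEngine(engine_dockerfile_config={"VERSION": "v0.7.3"})

qwq_engine = VllmEngine(**{
    **base_engine.model_dump(),                       # start from the base config
    "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER",  # override
    "default_cli_args": " --max_model_len 16000",                         # override
})

print(qwq_engine.engine_dockerfile_config)  # inherited from the base: {'VERSION': 'v0.7.3'}
```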

src/emd/models/llms/qwen.py

Lines changed: 33 additions & 2 deletions
@@ -6,7 +6,8 @@
     tgi_qwen2d5_72b_engine064,
     tgi_qwen2d5_on_inf2,
     tgi_qwen2d5_72b_on_inf2,
-    vllm_qwen2d5_72b_engine064
+    vllm_qwen2d5_72b_engine064,
+    vllm_qwq_engine073
 )
 from ..services import (
     sagemaker_service,
@@ -436,7 +437,7 @@
 Model.register(
     dict(
         model_id = "QwQ-32B-Preview",
-        supported_engines=[huggingface_llm_engine_4d41d2,vllm_qwen2d5_engine064],
+        supported_engines=[vllm_qwq_engine073],
         supported_instances=[
             g5d12xlarge_instance,
             g5d24xlarge_instance,
@@ -462,3 +463,33 @@
         model_series=QWEN_REASONING_MODEL
     )
 )
+
+Model.register(
+    dict(
+        model_id = "QwQ-32B",
+        supported_engines=[vllm_qwq_engine073],
+        supported_instances=[
+            g5d12xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service,
+            sagemaker_async_service,
+            ecs_service,
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="Qwen/QwQ-32B",
+        modelscope_model_id="Qwen/QwQ-32B",
+        require_huggingface_token=False,
+        application_scenario="large reasoning model",
+        description="large reasoning model provide by qwen team",
+        model_type=ModelType.LLM,
+        model_series=QWEN_REASONING_MODEL
+    )
+)
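`Model.register` adds the new `QwQ-32B` entry to emd's model catalog alongside the existing `QwQ-32B-Preview` registration, which now also uses `vllm_qwq_engine073`. A much-reduced, hypothetical sketch of what such a registry could look like (the real `Model` class in emd has many more fields and validation):

```python
# Hypothetical registry sketch: specs are keyed by model_id so the CLI can later
# resolve "QwQ-32B" to its supported engines, instances and services.
class Model:
    _registry: dict[str, dict] = {}

    @classmethod
    def register(cls, spec: dict) -> None:
        cls._registry[spec["model_id"]] = spec

    @classmethod
    def get(cls, model_id: str) -> dict:
        return cls._registry[model_id]

Model.register(dict(model_id="QwQ-32B", supported_engines=["vllm_qwq_engine073"]))
print(Model.get("QwQ-32B")["supported_engines"])  # ['vllm_qwq_engine073']
```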

tests/sdk_tests/client_tests/openai_client_test.py

Lines changed: 7 additions & 3 deletions
@@ -8,16 +8,18 @@ def chat_with_openai_stream(prompt):
         api_key=api_key,
         # base_url="http://127.0.0.1:8080/v1"
         # base_url="http://127.0.0.1:8080/v1"
-        base_url="http://ec2-54-189-171-204.us-west-2.compute.amazonaws.com:8080/v1"
+        base_url="http://ec2-54-202-58-38.us-west-2.compute.amazonaws.com:8080/v1"
     )
 
     response = client.chat.completions.create(
         # model="DeepSeek-R1-Distill-Qwen-1.5B",
-        model="Qwen2.5-72B-Instruct-AWQ",
+        # model="Qwen2.5-72B-Instruct-AWQ",
+        model="QwQ-32B",
         # model="Qwen2.5-1.5B-Instruct",
         messages=[
             # {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": prompt}
+            {"role": "user", "content": prompt},
+            # {"role": "assistant", "content": "<think>\n"}
         ],
         stream=True,
         temperature=0.6
@@ -26,6 +28,7 @@ def chat_with_openai_stream(prompt):
     print("AI: ", end="", flush=True)
     print(response)
     for chunk in response:
+        # print(chunk)
         # print(sfbdfb)
         # print(type(chunk))
         content = chunk.choices[0].delta.content
@@ -59,5 +62,6 @@ def chat_with_openai(prompt):
 
     print(response)
 # Test call
+# chat_with_openai_stream("9.11和9.9哪个更大?")
 chat_with_openai_stream("你好")
 # chat_with_openai("你好")

0 commit comments