
Commit 9f572b3

zhouxuan and AoyuQC authored
feat: add qwq-32b (#64)
* Remove forced checking of aws environment during local deployment
* merge
* modify tgi backend
* add g4dn and g6e instances
* add qwq-32b
* docs: add quick-link
* modify qwq-32b-chat-template

Co-authored-by: AoyuQC <aoyuzhan@amazon.com>
1 parent 8a4991e commit 9f572b3

File tree: 6 files changed (+121, -5 lines)


README.md

Lines changed: 4 additions & 0 deletions
@@ -14,6 +14,10 @@
   <img alt="GitHub contributors" src="https://img.shields.io/github/contributors/aws-samples/easy-model-deployer">
 </p>
 
+## 🔥 Latest News
+
+- 2025-03-06: Deploy QwQ-32B with [one command line](docs/en/best_deployment_practices.md##famous-models###qwen-series###qwq-32b).
+
 ## Introduction
 
 Easy Model Deployer is a lightweight tool designed to simplify the deployment of **Open-Source LLMs** ([Supported Models](docs/en/supported_models.md)) and Custom Models. It provides **OpenAI's Completions API** and [**LangChain Interface**](https://github.com/langchain-ai/langchain). Built for developers who need reliable and scalable model serving without complex setup, it seamlessly integrates with AWS services for efficient model deployment.
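The introduction above mentions the OpenAI-compatible API. As a quick illustration (not part of this commit's diff), here is a minimal streaming call against an EMD deployment, assuming a placeholder endpoint URL and API key; it mirrors the client test updated at the end of this commit.

```python
# Minimal sketch of calling an EMD deployment through its OpenAI-compatible API.
# The base_url and api_key below are placeholders, not values from this commit.
from openai import OpenAI

client = OpenAI(
    api_key="placeholder-key",
    base_url="http://your-emd-endpoint:8080/v1",  # hypothetical endpoint address
)

response = client.chat.completions.create(
    model="QwQ-32B",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    temperature=0.6,
)
for chunk in response:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
```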

docs/en/best_deployment_practices.md

Lines changed: 12 additions & 0 deletions
@@ -3,6 +3,18 @@
 
 This document provides examples of best practices for deploying models using EMD for various use cases.
 
+## Famous Models
+
+### Qwen Series
+
+#### QwQ-32B
+
+```bash
+emd deploy --model-id QwQ-32B --instance-type g5.12xlarge --engine-type vllm --service-type sagemaker
+```
+
+
+
 
 ## Deploying to Specific GPU Types
 
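The g5.12xlarge choice in the command above can be sanity-checked with simple arithmetic (an illustrative estimate, not from the commit): QwQ-32B has roughly 32 billion parameters, and a g5.12xlarge provides four 24 GB NVIDIA A10G GPUs.

```python
# Back-of-envelope memory estimate for QwQ-32B on g5.12xlarge (assumes bf16 weights;
# real usage also depends on KV cache, max_model_len and vLLM overhead).
params_b = 32            # ~32 billion parameters
bytes_per_param = 2      # bf16 / fp16
weights_gb = params_b * bytes_per_param   # ≈ 64 GB of weights alone
gpu_mem_gb = 24 * 4                       # g5.12xlarge: 4 x NVIDIA A10G, 24 GB each
print(f"weights ≈ {weights_gb} GB vs {gpu_mem_gb} GB total GPU memory")
# The weights alone exceed a single 24 GB GPU, so the model has to be
# sharded across the instance's four GPUs.
```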
src/emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+{%- if tools %}
+{{- '<|im_start|>system\n' }}
+{%- if messages[0]['role'] == 'system' %}
+{{- messages[0]['content'] }}
+{%- else %}
+{{- '' }}
+{%- endif %}
+{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+{%- for tool in tools %}
+{{- "\n" }}
+{{- tool | tojson }}
+{%- endfor %}
+{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+{%- if messages[0]['role'] == 'system' %}
+{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- for message in messages %}
+{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+{%- elif message.role == "assistant" and not message.tool_calls and not loop.last %}
+{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+{%- elif message.role == "assistant" and not loop.last %}
+{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+{{- '<|im_start|>' + message.role }}
+{%- if message.content %}
+{{- '\n' + content }}
+{%- endif %}
+{%- for tool_call in message.tool_calls %}
+{%- if tool_call.function is defined %}
+{%- set tool_call = tool_call.function %}
+{%- endif %}
+{{- '\n<tool_call>\n{"name": "' }}
+{{- tool_call.name }}
+{{- '", "arguments": ' }}
+{{- tool_call.arguments | tojson }}
+{{- '}\n</tool_call>' }}
+{%- endfor %}
+{{- '<|im_end|>\n' }}
+{%- elif message.role == "tool" %}
+{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+{{- '<|im_start|>user' }}
+{%- endif %}
+{{- '\n<tool_response>\n' }}
+{{- message.content }}
+{{- '\n</tool_response>' }}
+{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+{{- '<|im_end|>\n' }}
+{%- endif %}
+{%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+{{- '<|im_start|>assistant\n' }}
+{%- if messages[-1].role == "assistant" %}{{- messages[-1].content }}{%- endif %}
+{%- endif %}
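The template above follows Qwen's ChatML/tool-calling layout but adds a prefill branch: a trailing assistant message is skipped by the normal message loop and re-emitted after the generation prompt, so the model continues that text instead of starting a fresh reply. A minimal rendering sketch (illustrative only; it assumes the template is saved locally as `qwq_32b_add_prefill_chat_template.jinja` and is rendered with plain `jinja2` rather than through vLLM):

```python
# Render the prefill chat template directly with jinja2 to see its output shape.
from jinja2 import Template

with open("qwq_32b_add_prefill_chat_template.jinja") as f:  # local copy of the file above
    template = Template(f.read())

messages = [
    {"role": "user", "content": "Which is larger, 9.11 or 9.9?"},
    # Trailing assistant message: re-emitted after the generation prompt,
    # so generation continues from "<think>\n" instead of starting over.
    {"role": "assistant", "content": "<think>\n"},
]

print(template.render(messages=messages, add_generation_prompt=True))
# Expected tail of the rendered prompt:
# <|im_start|>assistant
# <think>
```

This matches the commented-out `{"role": "assistant", "content": "<think>\n"}` message in the updated client test at the end of this commit.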

src/emd/models/engines.py

Lines changed: 8 additions & 0 deletions
@@ -62,6 +62,8 @@ class ComfyuiEngine(Engine):
 })
 
 
+
+
 vllm_deepseek_r1_distill_qwen_engine071 = VllmEngine(**{
     **vllm_engine064.model_dump(),
     "engine_dockerfile_config": {"VERSION":"v0.7.1"},
@@ -103,6 +105,12 @@ class ComfyuiEngine(Engine):
     "default_cli_args": " --max_model_len 25000 --disable-log-stats --limit-mm-per-prompt image=20,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
 })
 
+vllm_qwq_engine073 = VllmEngine(**{
+    **vllm_qwen25vl72b_engine073.model_dump(),
+    "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
+    "default_cli_args": " --chat-template emd/models/chat_templates/qwq_32b_add_prefill_chat_template.jinja --max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser hermes"
+})
+
 
 vllm_internvl2d5_76b_engine064 = VllmEngine(**{
     **vllm_engine064.model_dump(),
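The new `vllm_qwq_engine073` is derived from the Qwen2.5-VL 72B engine config by dumping it with pydantic and overriding only the environment variables and CLI args. A self-contained sketch of that pattern (the `VllmEngine` below is a hypothetical, simplified stand-in, not the actual emd class):

```python
# Hypothetical, simplified stand-in for emd's engine config to show the
# model_dump-and-override pattern used above.
from pydantic import BaseModel

class VllmEngine(BaseModel):
    engine_dockerfile_config: dict = {}
    environment_variables: str = ""
    default_cli_args: str = ""

base_engine = VllmEngine(engine_dockerfile_config={"VERSION": "v0.7.3"})

qwq_engine = VllmEngine(**{
    **base_engine.model_dump(),                       # start from the base config
    "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER",  # override
    "default_cli_args": " --max_model_len 16000",                         # override
})

print(qwq_engine.engine_dockerfile_config)  # inherited from the base: {'VERSION': 'v0.7.3'}
```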

src/emd/models/llms/qwen.py

Lines changed: 33 additions & 2 deletions
@@ -6,7 +6,8 @@
     tgi_qwen2d5_72b_engine064,
     tgi_qwen2d5_on_inf2,
     tgi_qwen2d5_72b_on_inf2,
-    vllm_qwen2d5_72b_engine064
+    vllm_qwen2d5_72b_engine064,
+    vllm_qwq_engine073
 )
 from ..services import (
     sagemaker_service,
@@ -436,7 +437,7 @@
 Model.register(
     dict(
         model_id = "QwQ-32B-Preview",
-        supported_engines=[huggingface_llm_engine_4d41d2,vllm_qwen2d5_engine064],
+        supported_engines=[vllm_qwq_engine073],
         supported_instances=[
             g5d12xlarge_instance,
             g5d24xlarge_instance,
@@ -462,3 +463,33 @@
         model_series=QWEN_REASONING_MODEL
     )
 )
+
+Model.register(
+    dict(
+        model_id = "QwQ-32B",
+        supported_engines=[vllm_qwq_engine073],
+        supported_instances=[
+            g5d12xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service,
+            sagemaker_async_service,
+            ecs_service,
+            local_service
+        ],
+        supported_frameworks=[
+            fastapi_framework
+        ],
+        allow_china_region=True,
+        huggingface_model_id="Qwen/QwQ-32B",
+        modelscope_model_id="Qwen/QwQ-32B",
+        require_huggingface_token=False,
+        application_scenario="large reasoning model",
+        description="large reasoning model provide by qwen team",
+        model_type=ModelType.LLM,
+        model_series=QWEN_REASONING_MODEL
+    )
+)
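`Model.register` adds the new `QwQ-32B` entry to emd's model catalog alongside the existing `QwQ-32B-Preview` registration, which now also uses `vllm_qwq_engine073`. A much-reduced, hypothetical sketch of what such a registry could look like (the real `Model` class in emd has many more fields and validation):

```python
# Hypothetical registry sketch: specs are keyed by model_id so the CLI can later
# resolve "QwQ-32B" to its supported engines, instances and services.
class Model:
    _registry: dict[str, dict] = {}

    @classmethod
    def register(cls, spec: dict) -> None:
        cls._registry[spec["model_id"]] = spec

    @classmethod
    def get(cls, model_id: str) -> dict:
        return cls._registry[model_id]

Model.register(dict(model_id="QwQ-32B", supported_engines=["vllm_qwq_engine073"]))
print(Model.get("QwQ-32B")["supported_engines"])  # ['vllm_qwq_engine073']
```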

tests/sdk_tests/client_tests/openai_client_test.py

Lines changed: 7 additions & 3 deletions
@@ -8,16 +8,18 @@ def chat_with_openai_stream(prompt):
         api_key=api_key,
         # base_url="http://127.0.0.1:8080/v1"
         # base_url="http://127.0.0.1:8080/v1"
-        base_url="http://ec2-54-189-171-204.us-west-2.compute.amazonaws.com:8080/v1"
+        base_url="http://ec2-54-202-58-38.us-west-2.compute.amazonaws.com:8080/v1"
     )
 
     response = client.chat.completions.create(
         # model="DeepSeek-R1-Distill-Qwen-1.5B",
-        model="Qwen2.5-72B-Instruct-AWQ",
+        # model="Qwen2.5-72B-Instruct-AWQ",
+        model="QwQ-32B",
         # model="Qwen2.5-1.5B-Instruct",
         messages=[
             # {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": prompt}
+            {"role": "user", "content": prompt},
+            # {"role": "assistant", "content": "<think>\n"}
         ],
         stream=True,
         temperature=0.6
@@ -26,6 +28,7 @@ def chat_with_openai_stream(prompt):
     print("AI: ", end="", flush=True)
     print(response)
     for chunk in response:
+        # print(chunk)
         # print(sfbdfb)
         # print(type(chunk))
         content = chunk.choices[0].delta.content
@@ -59,5 +62,6 @@ def chat_with_openai(prompt):
 
     print(response)
 # Test call
+# chat_with_openai_stream("9.11和9.9哪个更大?")
 chat_with_openai_stream("你好")
 # chat_with_openai("你好")

0 commit comments