[MetaxGPU] adapt fastdeploy on metax gpu #3466

Closed
wants to merge 6 commits
6 changes: 6 additions & 0 deletions custom_ops/setup_ops.py
@@ -589,6 +589,12 @@ def find_end_files(directory, end_str):
        if not os.listdir(json_dir):
            raise ValueError("Git clone nlohmann_json failed!")
        sources = [
+           "gpu_ops/update_inputs_v1.cu",
+           "gpu_ops/save_with_output_msg.cc",
+           "gpu_ops/get_output.cc",
+           "gpu_ops/get_output_msg_with_topk.cc",
+           "gpu_ops/save_output_msg_with_topk.cc",
+           "gpu_ops/transfer_output.cc",
            "gpu_ops/save_with_output.cc",
            "gpu_ops/set_mask_value.cu",
            "gpu_ops/set_value_by_flags.cu",
114 changes: 57 additions & 57 deletions docs/features/multi-node_deployment.md
@@ -1,71 +1,71 @@
# Multi-Node Deployment

## Overview
Multi-node deployment enables tensor parallelism across multiple machines, supporting large models whose GPU memory requirements exceed a single machine's capacity.

## Environment Preparation
### Network Requirements
1. All nodes must be within the same local network
2. Ensure bidirectional connectivity between all nodes (test using `ping` and `nc -zv`; see the sketch below)
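
For example, a quick connectivity check between two nodes might look like this (illustrative addresses; the port should match the `--engine-worker-queue-port` passed at launch):
```shell
# Illustrative addresses; 8182 matches --engine-worker-queue-port in the
# launch example further below.
ping -c 3 192.168.1.102
nc -zv 192.168.1.102 8182
```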

### Software Requirements
1. Install the same version of FastDeploy on all nodes
2. [Recommended] Install and configure MPI (OpenMPI or MPICH)

## Tensor Parallel Deployment

### Recommended Launch Method
We recommend using `mpirun` for one-command startup, without manually starting each node.
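
A hedged sketch of such a launch, assuming passwordless SSH between the nodes and identical FastDeploy installs on both machines (exact flags vary between OpenMPI and MPICH):
```shell
# A sketch only: mpirun starts one copy of the same launch command per host.
mpirun -np 2 --host 192.168.1.101,192.168.1.102 \
    python -m fastdeploy.entrypoints.openai.api_server \
        --model baidu/ERNIE-4.5-300B-A47B-Paddle \
        --tensor-parallel-size 16 \
        --ips 192.168.1.101,192.168.1.102
```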

### Usage Instructions
1. Execute the same command on all machines
2. The IP order in the `ips` parameter determines the node startup sequence
3. The first IP will be designated as the master node
4. Ensure all nodes can resolve each other's hostnames

* Online inference startup example:
```shell
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
    --port 8180 \
    --metrics-port 8181 \
    --engine-worker-queue-port 8182 \
    --max-model-len 32768 \
    --max-num-seqs 32 \
    --tensor-parallel-size 16 \
    --ips 192.168.1.101,192.168.1.102
```

* Offline startup example:
```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

model_name_or_path = "baidu/ERNIE-4.5-300B-A47B-Paddle"

sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
llm = LLM(model=model_name_or_path, tensor_parallel_size=16, ips="192.168.1.101,192.168.1.102")
if llm._check_master():
    output = llm.generate(prompts="Who are you?", use_tqdm=True, sampling_params=sampling_params)
    print(output)
```

* Notes:
  * Only the master node can receive completion requests
  * Always send requests to the master node (the first IP in the `ips` list), as in the request sketch below
  * The master node will distribute workloads across all nodes
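
A hedged example of querying the master node's OpenAI-compatible endpoint, using the first IP and the `--port` value from the online launch example above (the payload follows the standard OpenAI chat-completions schema):
```shell
# Send requests to the master node only (first IP in --ips).
curl http://192.168.1.101:8180/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "baidu/ERNIE-4.5-300B-A47B-Paddle", "messages": [{"role": "user", "content": "Who are you?"}]}'
```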

### Parameter Description

#### `ips` Parameter
* **Type**: `string`
* **Format**: Comma-separated IPv4 addresses
* **Description**: Specifies the IP addresses of all nodes in the deployment group
* **Required**: Only for multi-node deployments
* **Example**: `"192.168.1.101,192.168.1.102,192.168.1.103"`

#### `tensor_parallel_size` Parameter
* **Type**: `integer`
* **Description**: Total number of GPUs across all nodes
* **Required**: Yes
* **Example**: For 2 nodes with 8 GPUs each, set to 16
83 changes: 83 additions & 0 deletions docs/get_started/installation/metax_gpu.md
@@ -0,0 +1,83 @@
# Metax GPU Installation for Running ERNIE 4.5 Series Models

The following installation methods are available when your environment meets these requirements:
- Python >= 3.10
- Linux X86_64

Before starting, prepare a machine equipped with MetaX C550 accelerator cards. Requirements:

| Chip Type | Driver Version | KMD Version |
| :---: | :---: | :---: |
| MetaX C550 | 3.0.0.1 | 2.14.6 |

## 1. Pre-built Docker Installation (Recommended)

```shell
docker login --username=cr_temp_user --password=eyJpbnN0YW5jZUlkIjoiY3JpLXpxYTIzejI2YTU5M3R3M2QiLCJ0aW1lIjoiMTc1NTUxODEwODAwMCIsInR5cGUiOiJzdWIiLCJ1c2VySWQiOiIyMDcwOTQwMTA1NjYzNDE3OTIifQ:8226ca50ce5476c42062e24d3c465545de1c1780 cr.metax-tech.com && docker pull cr.metax-tech.com/public-library/maca-native:3.0.0.4-ubuntu20.04-amd64
```

## 2. PaddlePaddle and Custom Device Installation

```shell
pip install paddlepaddle==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
pip install paddle-metax-gpu==3.0.0.dev20250807 -i https://www.paddlepaddle.org.cn/packages/nightly/maca/
```
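
A quick sanity check after the two installs, as a minimal sketch (the exact device-type string the MACA plugin registers is an assumption here; check the paddle-metax-gpu release notes):
```python
import paddle

print(paddle.__version__)
# The custom device plugin registers a device type with Paddle; the printed
# list should include the MetaX backend if the install succeeded.
print(paddle.device.get_all_custom_device_type())
```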

## 3. Build Wheel from Source
Clone the source code and build:
```shell
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy
bash build.sh
```
The built packages will be in the `FastDeploy/dist` directory.
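
Then install the built wheel; the exact filename depends on the FastDeploy version and Python ABI, hence the glob (an assumption, adjust to your `dist/` contents):
```shell
# Run from inside the FastDeploy checkout after build.sh finishes.
pip install dist/fastdeploy*.whl
```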

## 4. Environment Verification

After installation, verify the environment with this Python code:
```python
import paddle
from paddle.jit.marker import unified
# Verify GPU availability
paddle.utils.run_check()
# Verify FastDeploy custom operators compilation
from fastdeploy.model_executor.ops.gpu import beam_search_softmax
```

If the above code executes successfully, the environment is ready.

## 5. Demo

```python
from fastdeploy import LLM, SamplingParams

prompts = [
    "Hello. My name is",
]

sampling_params = SamplingParams(top_p=0.95, max_tokens=32, temperature=0.6)

llm = LLM(model="/root/model/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=256, engine_worker_queue_port=9135, quantization='wint8', static_decode_blocks=0, gpu_memory_utilization=0.9)

outputs = llm.generate(prompts, sampling_params)

print(f"Generated {len(outputs)} outputs")
print("=" * 50 + "\n")

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    print(prompt)
    print(generated_text)
    print("-" * 50)
```

Output:
```text
INFO 2025-08-18 10:54:18,455 416822 engine.py[line:202] Waiting worker processes ready...
Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [03:33<00:00, 2.14s/it]
Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:18<00:00, 5.54it/s]
INFO 2025-08-18 10:58:16,149 416822 engine.py[line:247] Worker processes are launched with 240.08204197883606 seconds.
Processed prompts: 100%|███████████████████████| 1/1 [00:21<00:00, 21.84s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Generated 1 outputs
==================================================

Hello. My name is
Alice and I'm here to help you. What can I do for you today?
Hello Alice! I'm trying to organize a small party
```
28 changes: 13 additions & 15 deletions docs/zh/features/multi-node_deployment.md
@@ -4,11 +4,10 @@
Multi-node deployment addresses cases where a single machine's GPU memory is insufficient, supporting tensor-parallel execution across multiple machines.

## Environment Preparation
### Network Requirements
1. All nodes must be within the same local network
2. Ensure bidirectional connectivity between all nodes (test using `ping` and `nc -zv`)

### Software Requirements
1. Install the same version of FastDeploy on all nodes
2. [Recommended] Install and configure MPI (OpenMPI or MPICH)
@@ -52,22 +51,21 @@
```

* Notes:
  * Only the master node can receive completion requests
  * Always send requests to the master node (the first IP in the `ips` list)
  * The master node will distribute workloads across all nodes

### Parameter Description

#### `ips` Parameter
* **Type**: `string`
* **Format**: Comma-separated IPv4 addresses
* **Description**: Specifies the IP addresses of all nodes in the deployment group
* **Required**: Only for multi-node deployments
* **Example**: `"192.168.1.101,192.168.1.102,192.168.1.103"`

#### `tensor_parallel_size` Parameter
* **Type**: `integer`
* **Description**: Total number of GPUs across all nodes
* **Required**: Yes
* **Example**: For 2 nodes with 8 GPUs each, set to 16
82 changes: 82 additions & 0 deletions docs/zh/get_started/installation/metax_gpu.md
@@ -0,0 +1,82 @@
# Running ERNIE 4.5 Series Models on the Metax GPU C550

FastDeploy has been deeply adapted and optimized for the ERNIE 4.5 series models on the Metax C550, unifying the inference entry point with the GPU path so that inference workloads can be migrated without modification.

Environment requirements:
- Python >= 3.10
- Linux X86_64

| Chip Type | Driver Version | KMD Version |
| :---: | :---: | :---: |
| MetaX C550 | 3.0.0.1 | 2.14.6 |

## 1. Obtain the Container Image

```shell
docker login --username=cr_temp_user --password=eyJpbnN0YW5jZUlkIjoiY3JpLXpxYTIzejI2YTU5M3R3M2QiLCJ0aW1lIjoiMTc1NTUxODEwODAwMCIsInR5cGUiOiJzdWIiLCJ1c2VySWQiOiIyMDcwOTQwMTA1NjYzNDE3OTIifQ:8226ca50ce5476c42062e24d3c465545de1c1780 cr.metax-tech.com && docker pull cr.metax-tech.com/public-library/maca-native:3.0.0.4-ubuntu20.04-amd64
```

## 2. Pre-installation

```shell
pip install paddlepaddle==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
pip install paddle-metax-gpu==3.0.0.dev20250807 -i https://www.paddlepaddle.org.cn/packages/nightly/maca/
```

## 3. Download and Build FastDeploy

```shell
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy
bash build.sh
```
The built packages will be in the `FastDeploy/dist` directory.

## 4. Environment Verification

After installation, verify the environment with this Python code:
```python
import paddle
from paddle.jit.marker import unified
# Verify GPU availability
paddle.utils.run_check()
# Verify FastDeploy custom operators compilation
from fastdeploy.model_executor.ops.gpu import beam_search_softmax
```
If the above code executes successfully, the environment is ready.

## 5. Example

```python
from fastdeploy import LLM, SamplingParams

prompts = [
    "Hello. My name is",
]

sampling_params = SamplingParams(top_p=0.95, max_tokens=32, temperature=0.6)

llm = LLM(model="/root/model/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=256, engine_worker_queue_port=9135, quantization='wint8', static_decode_blocks=0, gpu_memory_utilization=0.9)

outputs = llm.generate(prompts, sampling_params)

print(f"Generated {len(outputs)} outputs")
print("=" * 50 + "\n")

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    print(prompt)
    print(generated_text)
    print("-" * 50)
```

Output:
```text
INFO 2025-08-18 10:54:18,455 416822 engine.py[line:202] Waiting worker processes ready...
Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [03:33<00:00, 2.14s/it]
Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:18<00:00, 5.54it/s]
INFO 2025-08-18 10:58:16,149 416822 engine.py[line:247] Worker processes are launched with 240.08204197883606 seconds.
Processed prompts: 100%|███████████████████████| 1/1 [00:21<00:00, 21.84s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Generated 1 outputs
==================================================

Hello. My name is
Alice and I'm here to help you. What can I do for you today?
Hello Alice! I'm trying to organize a small party
```
2 changes: 1 addition & 1 deletion fastdeploy/engine/config.py
@@ -282,7 +282,7 @@ def check(self):
f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}"
)
assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger"
f"max_num_batched_tokens: {self.max_num_batched_tokens} should be less"
f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
)
assert (
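For orientation, a minimal sketch of the two bounds this `check()` enforces, with hypothetical values (the real ones come from the engine configuration):
```python
# Hypothetical values; the real ones come from the engine configuration.
max_model_len = 32768
max_num_seqs = 32
max_num_batched_tokens = 8192

# Lower bound: the batch must hold at least one token per scheduled sequence.
assert max_num_batched_tokens >= max_num_seqs
# Upper bound (the message fixed in this diff): the batch never needs more
# than every sequence running at full context length.
assert max_num_batched_tokens <= max_num_seqs * max_model_len
```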
@@ -257,6 +257,7 @@ def apply_rope(self, qk, cos, sin):
        out = paddle.add(paddle.multiply(qk, cos), paddle.multiply(rotate_half, sin))
        return paddle.cast(out, qk.dtype)

+   @paddle.no_grad()
    def forward_native_backend(
        self,
        q: paddle.Tensor,
@@ -273,7 +274,7 @@ def forward_native_backend(
        # 1. Separate the encoder / decoder masks
        seq_lens_encoder = forward_meta.seq_lens_encoder.squeeze(-1)
        seq_lens_decoder = forward_meta.seq_lens_decoder.squeeze(-1)
-       seq_lens_this_time = forward_meta.seq_lens_this_time.squeeze(-1)
+       seq_lens_this_time = forward_meta.seq_lens_this_time
        encoder_indices = []
        decoder_indices = []
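For context, a minimal standalone sketch (not FastDeploy code) of what the added `@paddle.no_grad()` decorator buys on an inference-only path such as `forward_native_backend`:
```python
import paddle

x = paddle.randn([2, 4])
x.stop_gradient = False  # pretend x is an activation that would track gradients

@paddle.no_grad()
def forward_native_sketch(t):
    # Ops executed here record no autograd graph, which saves memory on an
    # inference-only path.
    return paddle.multiply(t, t)

y = forward_native_sketch(x)
print(y.stop_gradient)  # True: no backward graph was built
```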