get_cur_cu_seq_len_k = None
import os

+from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.attention.moba_attention_backend import (
    MobaAttentionBackend,
)
+from fastdeploy.platforms import _Backend, current_platform


class ModelConfig:
    def __init__(self):
        self.num_hidden_layers = 12
        self.head_dim = 128
+        self.num_attention_heads = 8
+        self.num_key_value_heads = 1


class ParallelConfig:
    def __init__(self):
        self.block_size = 128
        self.max_model_len = 128 * 1024
        self.max_num_seqs = 1
+        self.tensor_parallel_size = 1


class ForwardMode:
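The stub configs above carry only the fields the MoBA backend reads. As a rough illustration (not part of the diff), the newly added head counts imply an 8:1 grouped-query ratio, and ParallelConfig's block size determines how many KV-cache blocks a full-length sequence needs:

# Illustrative arithmetic only; values mirror the stubs added in this hunk.
num_attention_heads = 8        # ModelConfig.num_attention_heads (query heads)
num_key_value_heads = 1        # ModelConfig.num_key_value_heads (shared KV heads)
gqa_group_size = num_attention_heads // num_key_value_heads   # 8 query heads per KV head

block_size = 128               # ParallelConfig.block_size
max_model_len = 128 * 1024     # ParallelConfig.max_model_len
max_blocks_per_seq = max_model_len // block_size              # 1024 cache blocks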
@@ -48,16 +53,17 @@ class FDConfig:
    def __init__(self):
        self.parallel_config = ParallelConfig()
        self.model_config = ModelConfig()
+        self.quant_config = {}


def test_moba_attention(seq_len, num_heads, num_kv_heads, head_dim):
    max_seq_len = int(128 * 1024)
-    moba_encoder_top_k_left = int(10)
-    moba_encoder_top_k_right = int(15)
-    moba_use_encoder_seq_limit = int(10 * 128)
-    moba_decoder_top_k_left = int(10)
-    moba_decoder_top_k_right = int(10)
-    moba_use_decoder_seq_limit = int(10 * 128)
+    moba_encoder_top_k_left = int(5)
+    moba_encoder_top_k_right = int(10)
+    moba_use_encoder_seq_limit = int(20 * 128)
+    moba_decoder_top_k_left = int(20)
+    moba_decoder_top_k_right = int(20)
+    moba_use_decoder_seq_limit = int(20 * 128)
    os.environ["FD_ATTENTION_BACKEND"] = "MOBA_ATTN"
    os.environ["FD_MOBA_MLP_WEIGHT_PATH"] = "None"
    os.environ["FD_MOBA_ENCODER_TOP_K_LEFT"] = str(moba_encoder_top_k_left)
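For anyone reproducing this configuration outside the test, here is a minimal sketch of the same environment setup, restricted to the variables visible in this hunk (the remaining FD_MOBA_* limits are set in the elided lines; the comments on their meaning are assumptions, not documented contracts):

import os

moba_env = {
    "FD_ATTENTION_BACKEND": "MOBA_ATTN",       # select the MoBA sparse-attention backend
    "FD_MOBA_MLP_WEIGHT_PATH": "None",         # no learned gate weights in this unit test
    "FD_MOBA_ENCODER_TOP_K_LEFT": str(5),      # encoder top-k lower bound (interpretation assumed)
    "FD_MOBA_MAX_SEQ_LENGTH": str(128 * 1024),
}
os.environ.update(moba_env)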
@@ -70,7 +76,7 @@ def test_moba_attention(seq_len, num_heads, num_kv_heads, head_dim):
    os.environ["FD_MOBA_MAX_SEQ_LENGTH"] = str(max_seq_len)

    max_dec_len_this_time = int(0)
-    qkv = paddle.randn([1, seq_len, num_heads + 2 * num_kv_heads, head_dim], dtype="bfloat16")
+    qkv = paddle.randn([1, 4 * seq_len, num_heads + 2 * num_kv_heads, head_dim], dtype="bfloat16")
    q_input = qkv[:, :, :num_heads, :].reshape([-1, num_heads, head_dim])
    k_input = qkv[:, :, num_heads : num_heads + num_kv_heads, :].reshape([-1, num_kv_heads, head_dim])
    v_input = qkv[:, :, num_heads + num_kv_heads :, :].reshape([-1, num_kv_heads, head_dim])
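The slicing above assumes the fused QKV tensor packs heads as [query heads | key heads | value heads] along the head axis. A small standalone check of that layout and the resulting shapes (sizes are illustrative, and float32 is used only to keep the shape check portable):

import paddle

num_heads, num_kv_heads, head_dim, tokens = 8, 1, 128, 16
qkv = paddle.randn([1, tokens, num_heads + 2 * num_kv_heads, head_dim], dtype="float32")

# Same packing as the test: Q heads first, then K heads, then V heads.
q = qkv[:, :, :num_heads, :].reshape([-1, num_heads, head_dim])
k = qkv[:, :, num_heads : num_heads + num_kv_heads, :].reshape([-1, num_kv_heads, head_dim])
v = qkv[:, :, num_heads + num_kv_heads :, :].reshape([-1, num_kv_heads, head_dim])

assert q.shape == [tokens, num_heads, head_dim]
assert k.shape == [tokens, num_kv_heads, head_dim]
assert v.shape == [tokens, num_kv_heads, head_dim]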
@@ -80,14 +86,14 @@ def test_moba_attention(seq_len, num_heads, num_kv_heads, head_dim):

    seq_lens_decoder = paddle.to_tensor([0], dtype="int32")

-    cachesk = paddle.zeros([(seq_len + 63) // 64 * 64, num_kv_heads, 64, head_dim], dtype="bfloat16")
-    cachesv = paddle.zeros([(seq_len + 63) // 64 * 64, num_kv_heads, 64, head_dim], dtype="bfloat16")
+    cachesk = paddle.zeros([(seq_len + 63) // 64 * 256, num_kv_heads, 64, head_dim], dtype="bfloat16")
+    cachesv = paddle.zeros([(seq_len + 63) // 64 * 256, num_kv_heads, 64, head_dim], dtype="bfloat16")

    block_tables = paddle.arange((seq_len + 63) // 64).astype("int32")

    rotary_embs = paddle.ones([seq_len, head_dim], dtype="float32")

-    cache_k_block_means = paddle.zeros([(seq_len + 63) // 64, num_kv_heads, 64, head_dim], dtype="bfloat16")
+    cache_k_block_means = paddle.zeros([(seq_len + 63) // 64 + 10, num_kv_heads, 64, head_dim], dtype="bfloat16")

    fd_config = FDConfig()
    forward_meta = ForwardMode()
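The cache shapes above follow from 64-token KV blocks. A short sketch of that arithmetic with the values from the updated __main__ block (the *256 capacity and +10 rows appear to be the test's own headroom choices; that reading is an assumption):

seq_len, num_kv_heads, head_dim = 2 * 1024, 1, 128

num_blocks = (seq_len + 63) // 64   # 64-token KV blocks for one sequence -> 32
cache_rows = num_blocks * 256       # enlarged cache capacity used by the updated test -> 8192
kv_cache_shape = [cache_rows, num_kv_heads, 64, head_dim]
print(num_blocks, kv_cache_shape)   # 32 [8192, 1, 64, 128]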
@@ -98,6 +104,7 @@ def test_moba_attention(seq_len, num_heads, num_kv_heads, head_dim):
    moba_attention_backend = MobaAttentionBackend(fd_config, 8, 1, 128)
    moba_attention_backend.init_attention_metadata(forward_meta)
    moba_attention_backend.get_kv_cache_shape(100)
+
    if moba_attention is None:
        return
    if get_cur_cu_seq_len_k is None:
@@ -150,12 +157,17 @@ def test_moba_attention(seq_len, num_heads, num_kv_heads, head_dim):
        "none",
    )[0]

-    return out
+    attention = Attention(fd_config, 0)
+
+    selected_backend = _Backend.__members__.get(os.environ["FD_ATTENTION_BACKEND"])
+    attention_cls = current_platform.get_attention_backend_cls(selected_backend)
+
+    return out, attention, attention_cls


if __name__ == "__main__":
    if paddle.is_compiled_with_cuda():
-        seq_len = int(20 * 1024)
+        seq_len = int(2 * 1024)
        num_heads = int(8)
        num_kv_heads = int(1)
        head_dim = int(128)
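With the new return values, a caller can sanity-check that the env var resolves to the MoBA backend. A hedged sketch of such a check (the unpacking assumes the custom attention ops are available, since the test returns early otherwise; the assertion is illustrative and not part of the PR):

result = test_moba_attention(seq_len, num_heads, num_kv_heads, head_dim)
if result is not None:                       # None means the custom ops were unavailable
    out, attention, attention_cls = result
    # FD_ATTENTION_BACKEND=MOBA_ATTN is expected to resolve to MobaAttentionBackend;
    # this equality check is an assumption made for illustration.
    assert attention_cls is MobaAttentionBackend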