Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/sglang/srt/configs/deepseekvl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def format_messages_v2(self, messages, pil_images, max_req_input_len=-1):
tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
messages,
pil_images[image_index : image_index + image_token_cnt],
bos=False,
bos=True,
eos=True,
cropping=len(pil_images) <= 2,
max_req_input_len=max_req_input_len,
Expand Down
4 changes: 3 additions & 1 deletion python/sglang/srt/configs/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,9 @@ def __init__(
self.attention_arch = AttentionArch.MLA
self.kv_lora_rank = self.hf_config.kv_lora_rank
self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures:
elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures and getattr(
self.hf_text_config, "use_mla", True
):
self.head_dim = 256
self.attention_arch = AttentionArch.MLA
self.kv_lora_rank = self.hf_text_config.kv_lora_rank
Expand Down
35 changes: 34 additions & 1 deletion python/sglang/srt/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,30 @@ def generate_embedding_convs(
return convs


# Chat templates for which missing modality placeholder tokens are supplied
# automatically at the start of the prompt when the request carries more media
# items than explicit placeholders (e.g. 3 images but only 2 <image> tokens).
_MODELS_REQUIRING_MODALITY_SUPPLEMENT = {"deepseek-vl2"}


# adapted from https://github.com/vllm-project/vllm/blob/5124f5bf51b83e6f344c1bc6652e8c4d81313b34/vllm/entrypoints/chat_utils.py#L856
def _get_full_multimodal_text_prompt(
modality_token: str, modality_count: int, text_prompt: str
) -> str:
"""Combine multimodal prompts for a multimodal language model."""

# For any existing placeholder in the text prompt, we leave it as is
left: int = modality_count - text_prompt.count(modality_token)
if left < 0:
raise ValueError(
f"Found more '{modality_token}' placeholders in input prompt than "
"actual multimodal data items."
)

# NOTE: For now we always add missing modality_token at the front of
# the prompt. This may change to be customizable in the future.
return "\n".join([modality_token] * left + [text_prompt])


def generate_chat_conv(
request: ChatCompletionRequest, template_name: str
) -> Conversation:
Expand Down Expand Up @@ -520,6 +544,12 @@ def generate_chat_conv(
if conv.name != "qwen2-vl"
else conv.image_token
)
add_token_as_needed: bool = (
conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
)
if add_token_as_needed:
image_token = ""

audio_token = conv.audio_token
for content in message.content:
if content.type == "text":
Expand All @@ -533,7 +563,10 @@ def generate_chat_conv(
elif content.type == "audio_url":
real_content += audio_token
conv.append_audio(content.audio_url.url)

if add_token_as_needed:
real_content = _get_full_multimodal_text_prompt(
conv.image_token, num_image_url, real_content
)
conv.append_message(conv.roles[0], real_content)
elif msg_role == "assistant":
parsed_content = ""
Expand Down
14 changes: 12 additions & 2 deletions python/sglang/srt/models/deepseek.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,8 +382,14 @@ def forward(
input_ids: torch.Tensor,
positions: torch.Tensor,
forward_batch: ForwardBatch,
input_embeds: torch.Tensor = None,
) -> torch.Tensor:
hidden_states = self.embed_tokens(input_ids)

if input_embeds is None:
hidden_states = self.embed_tokens(input_ids)
else:
hidden_states = input_embeds

residual = None
for i in range(len(self.layers)):
layer = self.layers[i]
Expand Down Expand Up @@ -416,14 +422,18 @@ def __init__(
)
self.logits_processor = LogitsProcessor(config)

def get_input_embeddings(self) -> nn.Embedding:
    """Return the model's token-embedding module.

    Exposes the inner embedding layer so callers (presumably multimodal
    wrappers that embed text ids themselves — confirm against callers)
    can look up token embeddings without running the full model.
    """
    return self.model.embed_tokens

@torch.no_grad()
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    forward_batch: ForwardBatch,
    input_embeds: torch.Tensor = None,
) -> torch.Tensor:
    """Run the decoder and produce logits for the batch.

    Args:
        input_ids: token ids for the batch.
        positions: position ids matching ``input_ids``.
        forward_batch: batch metadata for the forward pass.
        input_embeds: optional precomputed embeddings; when given, the
            inner model uses them instead of embedding ``input_ids``.
    """
    # input_embeds is forwarded so multimodal callers can inject their own
    # embeddings; when it is None the model embeds input_ids itself.
    hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
    return self.logits_processor(
        input_ids, hidden_states, self.lm_head, forward_batch
    )
Expand Down
7 changes: 6 additions & 1 deletion python/sglang/srt/models/deepseek_vl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.deepseek import DeepseekForCausalLM
from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM


Expand Down Expand Up @@ -189,7 +190,11 @@ def __init__(

# ----------- language model ------------
language_config = config.language_config
self.language_model = DeepseekV2ForCausalLM(language_config)
if language_config.use_mla:
self.language_model = DeepseekV2ForCausalLM(language_config)
else:
# deepseek-vl2-tiny forbids mla
self.language_model = DeepseekForCausalLM(language_config)

def _init_vision_module(
self, vision_config, quant_config: Optional[QuantizationConfig]
Expand Down
24 changes: 24 additions & 0 deletions test/srt/test_vision_openai_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,30 @@ def test_video_chat_completion(self):
pass


class TestDeepseekVL2TinyServer(TestOpenAIVisionServer):
    """Vision OpenAI-API server tests against deepseek-ai/deepseek-vl2-tiny."""

    @classmethod
    def setUpClass(cls):
        # Launch one shared server for the whole class; torn down by the base
        # class. --trust-remote-code is required for this HF model's custom code.
        cls.model = "deepseek-ai/deepseek-vl2-tiny"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--trust-remote-code",
                "--chat-template",
                "deepseek-vl2",
                "--context-length",
                "4096",
            ],
        )
        # Requests go through the OpenAI-compatible /v1 endpoint.
        cls.base_url += "/v1"

    def test_video_chat_completion(self):
        # Intentionally a no-op: overrides the inherited video test, which
        # this model is not exercised on.
        pass


class TestJanusProServer(TestOpenAIVisionServer):
@classmethod
def setUpClass(cls):
Expand Down
Loading