Skip to content

Commit d09a51f

Browse files
[feat&refactor] Enhance multimodal input support with refactor io_struct (sgl-project#4938)
Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
1 parent f8194b2 commit d09a51f

File tree

4 files changed

+810
-103
lines changed

4 files changed

+810
-103
lines changed

python/sglang/srt/entrypoints/engine.py

Lines changed: 34 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@
2929

3030
import zmq
3131
import zmq.asyncio
32+
from PIL.Image import Image
3233

3334
# Fix a bug of Python threading
3435
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -135,9 +136,19 @@ def generate(
135136
sampling_params: Optional[Union[List[Dict], Dict]] = None,
136137
# The token ids for text; one can either specify text or input_ids.
137138
input_ids: Optional[Union[List[List[int]], List[int]]] = None,
138-
# The image input. It can be a file name, a url, or base64 encoded string.
139-
# See also python/sglang/srt/utils.py:load_image.
140-
image_data: Optional[Union[List[str], str]] = None,
139+
# The image input. It can be an image instance, file name, URL, or base64 encoded string.
140+
# Can be formatted as:
141+
# - Single image for a single request
142+
# - List of images (one per request in a batch)
143+
# - List of lists of images (multiple images per request)
144+
# See also python/sglang/srt/utils.py:load_image for more details.
145+
image_data: Optional[
146+
Union[
147+
List[List[Union[Image, str]]],
148+
List[Union[Image, str]],
149+
Union[Image, str],
150+
]
151+
] = None,
141152
return_logprob: Optional[Union[List[bool], bool]] = False,
142153
logprob_start_len: Optional[Union[List[int], int]] = None,
143154
top_logprobs_num: Optional[Union[List[int], int]] = None,
@@ -190,9 +201,19 @@ async def async_generate(
190201
sampling_params: Optional[Union[List[Dict], Dict]] = None,
191202
# The token ids for text; one can either specify text or input_ids.
192203
input_ids: Optional[Union[List[List[int]], List[int]]] = None,
193-
# The image input. It can be a file name, a url, or base64 encoded string.
194-
# See also python/sglang/srt/utils.py:load_image.
195-
image_data: Optional[Union[List[str], str]] = None,
204+
# The image input. It can be an image instance, file name, URL, or base64 encoded string.
205+
# Can be formatted as:
206+
# - Single image for a single request
207+
# - List of images (one per request in a batch)
208+
# - List of lists of images (multiple images per request)
209+
# See also python/sglang/srt/utils.py:load_image for more details.
210+
image_data: Optional[
211+
Union[
212+
List[List[Union[Image, str]]],
213+
List[Union[Image, str]],
214+
Union[Image, str],
215+
]
216+
] = None,
196217
return_logprob: Optional[Union[List[bool], bool]] = False,
197218
logprob_start_len: Optional[Union[List[int], int]] = None,
198219
top_logprobs_num: Optional[Union[List[int], int]] = None,
@@ -228,7 +249,13 @@ async def async_generate(
228249
def encode(
229250
self,
230251
prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
231-
image_data: Optional[Union[List[str], str]] = None,
252+
image_data: Optional[
253+
Union[
254+
List[List[Union[Image, str]]],
255+
List[Union[Image, str]],
256+
Union[Image, str],
257+
]
258+
] = None,
232259
) -> Dict:
233260
"""
234261
The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.

python/sglang/srt/entrypoints/verl_engine.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import torch
1818
import torch.distributed as dist
19+
from PIL.Image import Image
1920
from torch.distributed.tensor import DeviceMesh, DTensor
2021

2122
from sglang.srt.model_executor.model_runner import LocalSerializedTensor
@@ -56,9 +57,19 @@ def generate(
5657
sampling_params: Optional[Union[List[Dict], Dict]] = None,
5758
# The token ids for text; one can either specify text or input_ids.
5859
input_ids: Optional[Union[List[List[int]], List[int]]] = None,
59-
# The image input. It can be a file name, a url, or base64 encoded string.
60-
# See also python/sglang/srt/utils.py:load_image.
61-
image_data: Optional[Union[List[str], str]] = None,
60+
# The image input. It can be an image instance, file name, URL, or base64 encoded string.
61+
# Can be formatted as:
62+
# - Single image for a single request
63+
# - List of images (one per request in a batch)
64+
# - List of lists of images (multiple images per request)
65+
# See also python/sglang/srt/utils.py:load_image for more details.
66+
image_data: Optional[
67+
Union[
68+
List[List[Union[Image, str]]],
69+
List[Union[Image, str]],
70+
Union[Image, str],
71+
]
72+
] = None,
6273
return_logprob: Optional[Union[List[bool], bool]] = False,
6374
logprob_start_len: Optional[Union[List[int], int]] = None,
6475
top_logprobs_num: Optional[Union[List[int], int]] = None,

0 commit comments

Comments
 (0)