Skip to content

Commit bd7cfbd

Browse files
authored
[Fix] Reduce busy polling when scheduler is idle (sgl-project#6026)
1 parent 4b9971e commit bd7cfbd

File tree

5 files changed

+48
-1
lines changed

5 files changed

+48
-1
lines changed

docs/backend/server_arguments.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
107107
| `--download-dir` | Model download directory for huggingface. | None |
108108
| `--base-gpu-id` | The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine. | 0 |
109109
| `--gpu-id-step` | The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,.... | 1 |
110-
110+
| `--sleep-on-idle` | Reduce CPU usage when sglang is idle by letting the scheduler wait on its input sockets instead of busy polling. | False |
111111

112112
## Logging
113113

python/sglang/srt/disaggregation/decode.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,7 @@ def event_loop_normal_disagg_decode(self: Scheduler):
550550
# When the server is idle, do self-check and re-init some states
551551
self.check_memory()
552552
self.new_token_ratio = self.init_new_token_ratio
553+
self.maybe_sleep_on_idle()
553554

554555
self.last_batch = batch
555556

@@ -628,6 +629,7 @@ def event_loop_overlap_disagg_decode(self: Scheduler):
628629
# When the server is idle, do self-check and re-init some states
629630
self.check_memory()
630631
self.new_token_ratio = self.init_new_token_ratio
632+
self.maybe_sleep_on_idle()
631633

632634
self.last_batch = batch
633635
self.last_batch_in_queue = last_batch_in_queue

python/sglang/srt/disaggregation/prefill.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ def event_loop_normal_disagg_prefill(self: Scheduler):
242242
if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
243243
self.check_memory()
244244
self.new_token_ratio = self.init_new_token_ratio
245+
self.maybe_sleep_on_idle()
245246

246247
self.last_batch = batch
247248
# HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -294,6 +295,7 @@ def event_loop_overlap_disagg_prefill(self: Scheduler):
294295
if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
295296
self.check_memory()
296297
self.new_token_ratio = self.init_new_token_ratio
298+
self.maybe_sleep_on_idle()
297299

298300
self.last_batch = batch
299301
# HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it

python/sglang/srt/managers/scheduler.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,27 @@ class EmbeddingBatchResult:
179179
bid: int
180180

181181

182+
class IdleSleeper:
    """Block the scheduler loop on its input sockets while there is no work.

    In setups with long inactivity periods it is desirable to reduce system
    power consumption while sglang is doing nothing. Besides saving power,
    this leaves more CPU thermal headroom for when a request eventually
    arrives. It matters most when multiple GPUs are connected, because each
    GPU would otherwise pin one thread at 100% CPU usage.

    The simplest solution is to wait with a ``zmq.Poller`` on every socket
    that may receive data that needs handling immediately.

    Args:
        sockets: Iterable of zmq sockets to watch for incoming messages.
        timeout_ms: Upper bound, in milliseconds, on a single idle wait.
            Defaults to 1000, the value that used to be hard-coded.
    """

    def __init__(self, sockets, timeout_ms: int = 1000):
        self.timeout_ms = timeout_ms
        self.poller = zmq.Poller()
        for sock in sockets:
            # Only readability matters: wake up as soon as a message arrives.
            self.poller.register(sock, zmq.POLLIN)

    def maybe_sleep(self):
        """Wait until a registered socket is readable or the timeout elapses."""
        self.poller.poll(self.timeout_ms)
201+
202+
182203
class Scheduler(
183204
SchedulerOutputProcessorMixin,
184205
SchedulerDisaggregationDecodeMixin,
@@ -228,6 +249,8 @@ def __init__(
228249

229250
# Init inter-process communication
230251
context = zmq.Context(2)
252+
self.idle_sleeper = None
253+
231254
if self.pp_rank == 0 and self.attn_tp_rank == 0:
232255
self.recv_from_tokenizer = get_zmq_socket(
233256
context, zmq.PULL, port_args.scheduler_input_ipc_name, False
@@ -250,6 +273,13 @@ def __init__(
250273
self.recv_from_rpc = get_zmq_socket(
251274
context, zmq.DEALER, port_args.rpc_ipc_name, False
252275
)
276+
if self.server_args.sleep_on_idle:
277+
self.idle_sleeper = IdleSleeper(
278+
[
279+
self.recv_from_tokenizer,
280+
self.recv_from_rpc,
281+
]
282+
)
253283
else:
254284
self.recv_from_tokenizer = None
255285
self.recv_from_rpc = None
@@ -478,6 +508,10 @@ def __init__(
478508
)
479509
self.init_disaggregation()
480510

511+
def maybe_sleep_on_idle(self):
    """Let the idle sleeper wait for incoming data, if one is configured."""
    sleeper = self.idle_sleeper
    if sleeper is None:
        # No idle sleeper on this scheduler: return right away.
        return
    sleeper.maybe_sleep()
514+
481515
def init_tokenizer(self):
482516
server_args = self.server_args
483517

@@ -667,6 +701,7 @@ def event_loop_normal(self):
667701
# When the server is idle, do self-check and re-init some states
668702
self.check_memory()
669703
self.new_token_ratio = self.init_new_token_ratio
704+
self.maybe_sleep_on_idle()
670705

671706
self.last_batch = batch
672707

@@ -711,6 +746,7 @@ def event_loop_overlap(self):
711746
# When the server is idle, do self-check and re-init some states
712747
self.check_memory()
713748
self.new_token_ratio = self.init_new_token_ratio
749+
self.maybe_sleep_on_idle()
714750

715751
self.last_batch = batch
716752

@@ -816,6 +852,7 @@ def event_loop_pp(self):
816852
if server_is_idle:
817853
self.check_memory()
818854
self.new_token_ratio = self.init_new_token_ratio
855+
self.maybe_sleep_on_idle()
819856

820857
def recv_requests(self) -> List[Req]:
821858
"""Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""

python/sglang/srt/server_args.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ class ServerArgs:
9090
download_dir: Optional[str] = None
9191
base_gpu_id: int = 0
9292
gpu_id_step: int = 1
93+
sleep_on_idle: bool = False
9394

9495
# Logging
9596
log_level: str = "info"
@@ -844,6 +845,11 @@ def add_cli_args(parser: argparse.ArgumentParser):
844845
default=ServerArgs.gpu_id_step,
845846
help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
846847
)
848+
parser.add_argument(
849+
"--sleep-on-idle",
850+
action="store_true",
851+
help="Reduce CPU usage when sglang is idle.",
852+
)
847853

848854
# Logging
849855
parser.add_argument(

0 commit comments

Comments
 (0)