Skip to content

Commit 354ac43

Browse files
authored
[pd-router] Add Configurable Retry Logic to reduce backend pressure (sgl-project#8744)
1 parent d98a491 commit 354ac43

File tree

10 files changed

+501
-292
lines changed

10 files changed

+501
-292
lines changed

sgl-router/src/config/types.rs

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ pub struct RouterConfig {
3939
pub max_concurrent_requests: usize,
4040
/// CORS allowed origins
4141
pub cors_allowed_origins: Vec<String>,
42+
/// Retry configuration
43+
pub retry: RetryConfig,
4244
}
4345

4446
/// Routing mode configuration
@@ -182,6 +184,30 @@ impl Default for DiscoveryConfig {
182184
}
183185
}
184186

187+
/// Retry configuration for request handling.
///
/// Groups the knobs for retrying a request with exponential backoff: an
/// attempt limit, a starting delay, a delay ceiling and a growth factor.
/// The code that consumes these values is not part of this hunk.
// NOTE(review): the exact delay formula (and whether jitter is applied) is not
// visible here; confirm against the router's retry loop.
188+
#[derive(Debug, Clone, Serialize, Deserialize)]
189+
pub struct RetryConfig {
190+
/// Maximum number of retry attempts. `Default` sets this to 3.
// NOTE(review): whether this count includes the initial attempt cannot be
// determined from this definition; verify at the call site.
191+
pub max_retries: u32,
192+
/// Initial backoff delay in milliseconds. `Default` sets this to 100.
193+
pub initial_backoff_ms: u64,
194+
/// Maximum backoff delay in milliseconds. `Default` sets this to 10000.
195+
pub max_backoff_ms: u64,
196+
/// Backoff multiplier for exponential backoff. `Default` sets this to 2.0.
// NOTE(review): presumably each successive delay is the previous one times
// this factor, capped at `max_backoff_ms`; confirm against the caller.
197+
pub backoff_multiplier: f32,
198+
}
199+
200+
// Default retry settings; `RouterConfig::default()` and the Python-facing
// `Router` in lib.rs both obtain their retry configuration from this impl.
impl Default for RetryConfig {
201+
/// Returns the stock settings: 3 retries, 100 ms initial backoff,
/// 10000 ms maximum backoff, and a backoff multiplier of 2.0.
fn default() -> Self {
202+
Self {
203+
max_retries: 3,
204+
// Delays are expressed in milliseconds (see the `_ms` field suffix).
initial_backoff_ms: 100,
205+
max_backoff_ms: 10000, // 10000 ms = 10 s
206+
backoff_multiplier: 2.0,
207+
}
208+
}
209+
}
210+
185211
/// Metrics configuration
186212
#[derive(Debug, Clone, Serialize, Deserialize)]
187213
pub struct MetricsConfig {
@@ -210,7 +236,7 @@ impl Default for RouterConfig {
210236
host: "127.0.0.1".to_string(),
211237
port: 3001,
212238
max_payload_size: 268_435_456, // 256MB
213-
request_timeout_secs: 600,
239+
request_timeout_secs: 3600, // 1 hour to match Python mini LB
214240
worker_startup_timeout_secs: 300,
215241
worker_startup_check_interval_secs: 10,
216242
dp_aware: false,
@@ -222,6 +248,7 @@ impl Default for RouterConfig {
222248
request_id_headers: None,
223249
max_concurrent_requests: 64,
224250
cors_allowed_origins: vec![],
251+
retry: RetryConfig::default(),
225252
}
226253
}
227254
}
@@ -277,7 +304,7 @@ mod tests {
277304
assert_eq!(config.host, "127.0.0.1");
278305
assert_eq!(config.port, 3001);
279306
assert_eq!(config.max_payload_size, 268_435_456);
280-
assert_eq!(config.request_timeout_secs, 600);
307+
assert_eq!(config.request_timeout_secs, 3600);
281308
assert_eq!(config.worker_startup_timeout_secs, 300);
282309
assert_eq!(config.worker_startup_check_interval_secs, 10);
283310
assert!(config.discovery.is_none());
@@ -332,6 +359,7 @@ mod tests {
332359
request_id_headers: None,
333360
max_concurrent_requests: 64,
334361
cors_allowed_origins: vec![],
362+
retry: RetryConfig::default(),
335363
};
336364

337365
let json = serde_json::to_string(&config).unwrap();
@@ -759,6 +787,7 @@ mod tests {
759787
request_id_headers: None,
760788
max_concurrent_requests: 64,
761789
cors_allowed_origins: vec![],
790+
retry: RetryConfig::default(),
762791
};
763792

764793
assert!(config.mode.is_pd_mode());
@@ -810,6 +839,7 @@ mod tests {
810839
request_id_headers: None,
811840
max_concurrent_requests: 64,
812841
cors_allowed_origins: vec![],
842+
retry: RetryConfig::default(),
813843
};
814844

815845
assert!(!config.mode.is_pd_mode());
@@ -857,6 +887,7 @@ mod tests {
857887
request_id_headers: None,
858888
max_concurrent_requests: 64,
859889
cors_allowed_origins: vec![],
890+
retry: RetryConfig::default(),
860891
};
861892

862893
assert!(config.has_service_discovery());

sgl-router/src/lib.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ pub enum PolicyType {
1919
Random,
2020
RoundRobin,
2121
CacheAware,
22-
PowerOfTwo, // Moved from PD-specific, now shared
22+
PowerOfTwo,
2323
}
2424

2525
#[pyclass]
@@ -45,22 +45,18 @@ struct Router {
4545
selector: HashMap<String, String>,
4646
service_discovery_port: u16,
4747
service_discovery_namespace: Option<String>,
48-
// PD service discovery fields
4948
prefill_selector: HashMap<String, String>,
5049
decode_selector: HashMap<String, String>,
5150
bootstrap_port_annotation: String,
5251
prometheus_port: Option<u16>,
5352
prometheus_host: Option<String>,
5453
request_timeout_secs: u64,
5554
request_id_headers: Option<Vec<String>>,
56-
// PD mode flag
5755
pd_disaggregation: bool,
58-
// PD-specific fields (only used when pd_disaggregation is true)
5956
prefill_urls: Option<Vec<(String, Option<u16>)>>,
6057
decode_urls: Option<Vec<String>>,
6158
prefill_policy: Option<PolicyType>,
6259
decode_policy: Option<PolicyType>,
63-
// Additional server config fields
6460
max_concurrent_requests: usize,
6561
cors_allowed_origins: Vec<String>,
6662
}
@@ -150,6 +146,7 @@ impl Router {
150146
request_id_headers: self.request_id_headers.clone(),
151147
max_concurrent_requests: self.max_concurrent_requests,
152148
cors_allowed_origins: self.cors_allowed_origins.clone(),
149+
retry: config::RetryConfig::default(),
153150
})
154151
}
155152
}
@@ -289,7 +286,6 @@ impl Router {
289286
check_interval: std::time::Duration::from_secs(60),
290287
port: self.service_discovery_port,
291288
namespace: self.service_discovery_namespace.clone(),
292-
// PD mode configuration
293289
pd_mode: self.pd_disaggregation,
294290
prefill_selector: self.prefill_selector.clone(),
295291
decode_selector: self.decode_selector.clone(),

sgl-router/src/routers/factory.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ impl RouterFactory {
5050
ctx.router_config.worker_startup_check_interval_secs,
5151
ctx.router_config.dp_aware,
5252
ctx.router_config.api_key.clone(),
53+
ctx.router_config.retry.clone(),
5354
)?;
5455

5556
Ok(Box::new(router))
@@ -79,6 +80,7 @@ impl RouterFactory {
7980
ctx.client.clone(),
8081
ctx.router_config.worker_startup_timeout_secs,
8182
ctx.router_config.worker_startup_check_interval_secs,
83+
ctx.router_config.retry.clone(),
8284
)?;
8385

8486
Ok(Box::new(router))

0 commit comments

Comments
 (0)