|
| 1 | +@inproceedings{namyar2025mitigation, |
| 2 | + title = {Enhancing Network Failure Mitigation with Performance-Aware Ranking}, |
| 3 | + author = {Namyar, Pooria and Ghavidel, Arvin and Crankshaw, Daniel and Berger, Daniel S. and Hsieh, Kevin and Kandula, Srikanth and Govindan, Ramesh and Arzani, Behnaz}, |
| 4 | + booktitle = {22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)}, |
| 5 | + year = {2025}, |
| 6 | + abbr = {NSDI}, |
| 7 | + url = {https://arxiv.org/abs/2305.13792}, |
| 8 | + abstract = {Cloud providers install mitigations to reduce the impact of network failures in their datacenters. To determine the best action, existing automatic network mitigation systems rely on simple local criteria or global proxy metrics. In this paper, we show that we can explicitly optimize end-to-end flow-level metrics and analyze actions holistically to support a broader range of actions and select much more effective mitigations. To this end, we develop novel techniques to quickly estimate the impact of different mitigations and rank them with high fidelity. Our results on incidents from a large cloud provider show orders of magnitude improvements in flow completion time and throughput. We also show our approach scales to large datacenters.}, |
| 9 | +} |
| 10 | + |
1 | 11 | @inproceedings{namyar2024metaopt,
|
2 | 12 | author = {Pooria Namyar and Behnaz Arzani and Ryan Beckett and Santiago Segarra and Himanshu Raj and Umesh Krishnaswamy and Ramesh Govindan and Srikanth Kandula},
|
3 | 13 | title = {Finding Adversarial Inputs for Heuristics using Multi-level Optimization},
|
@@ -30,16 +40,6 @@ @inproceedings{namyar2024maxminfair
|
30 | 40 | code = {https://github.com/microsoft/Soroush}
|
31 | 41 | }
|
32 | 42 |
|
33 |
| -@article{namyar2023mitigation, |
34 |
| - title={Mitigating the Performance Impact of Network Failures in Public Clouds}, |
35 |
| - author={Namyar, Pooria and Arzani, Behnaz and Crankshaw, Daniel and Berger, Daniel S and Hsieh, Kevin and Kandula, Srikanth and Govindan, Ramesh}, |
36 |
| - journal={arXiv preprint arXiv:2305.13792}, |
37 |
| - year={2023}, |
38 |
| - abbr={Preprint}, |
39 |
| - url={https://arxiv.org/abs/2305.13792}, |
40 |
| - abstract={Some faults in data center networks require hours to days to repair because they may need reboots, re-imaging, or manual work by technicians. To reduce traffic impact, cloud providers mitigate the effect of faults, for example, by steering traffic to alternate paths. The state-of-art in automatic network mitigations uses simple safety checks and proxy metrics to determine mitigations. SWARM, the approach described in this paper, can pick orders of magnitude better mitigations by estimating end-to-end connection-level performance (CLP) metrics. At its core is a scalable CLP estimator that quickly ranks mitigations with high fidelity and, on failures observed at a large cloud provider, outperforms the state-of-the-art by over 700$\times$ in some cases.} |
41 |
| -} |
42 |
| - |
43 | 43 | @article{Chitavis2023,
|
44 | 44 | author={Chitavisutthivong, Kanatip and Supittayapornpong, Sucha and Namyar, Pooria and Zhang, Mingyang and Yu, Minlan and Govindan, Ramesh},
|
45 | 45 | journal={IEEE/ACM Transactions on Networking},
|
|
0 commit comments