% ScaleLLM paper, EMNLP 2024 Industry Track (ACL Anthology ID 2024.emnlp-industry.22).
% The url field uses the canonical aclanthology.org address for that ID rather than
% a preview/staging build; the doi field identifies the same record.
@inproceedings{yao-etal-2024-scalellm,
  author    = {Yao, Yuhang and Jin, Han and Shah, Alay Dilipbhai and Han, Shanshan and Hu, Zijian and Stripelis, Dimitris and Ran, Yide and Xu, Zhaozhuo and Avestimehr, Salman and He, Chaoyang},
  title     = {{ScaleLLM}: A Resource-Frugal {LLM} Serving Framework by Optimizing End-to-End Efficiency},
  editor    = {Dernoncourt, Franck and Preo{\c{t}}iuc-Pietro, Daniel and Shimorina, Anastasia},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, US},
  publisher = {Association for Computational Linguistics},
  pages     = {279--289},
  doi       = {10.18653/v1/2024.emnlp-industry.22},
  url       = {https://aclanthology.org/2024.emnlp-industry.22/},
}