% ScaleLLM paper, EMNLP 2024 Industry Track proceedings, pages 279--289.
% The url field uses the canonical ACL Anthology address (not a temporary
% preview-branch build); the doi field remains the stable identifier.
% The product name in the title is brace-protected as a whole word so that
% sentence-casing styles keep its capitalisation without mid-word braces.
% NOTE(review): address appears to hold the conference location rather than
% the publisher's city -- confirm this is what the target style expects.
@inproceedings{yao-etal-2024-scalellm,
    title = "{ScaleLLM}: A Resource-Frugal {LLM} Serving Framework by Optimizing End-to-End Efficiency",
    author = "Yao, Yuhang  and
      Jin, Han  and
      Shah, Alay Dilipbhai  and
      Han, Shanshan  and
      Hu, Zijian  and
      Stripelis, Dimitris  and
      Ran, Yide  and
      Xu, Zhaozhuo  and
      Avestimehr, Salman  and
      He, Chaoyang",
    editor = "Dernoncourt, Franck  and
      Preo{\c{t}}iuc-Pietro, Daniel  and
      Shimorina, Anastasia",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
    month = nov,
    year = "2024",
    address = "Miami, Florida, US",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-industry.22/",
    doi = "10.18653/v1/2024.emnlp-industry.22",
    pages = "279--289"
}