% EMNLP 2024 Industry Track paper (ACL Anthology ID 2024.emnlp-industry.34).
% The url field uses the permanent aclanthology.org landing page rather than a
% temporary preview/ingestion host; the doi field is the preferred identifier.
@inproceedings{stripelis-etal-2024-tensoropera,
  author    = {Stripelis, Dimitris and Xu, Zhaozhuo and Hu, Zijian and Shah, Alay Dilipbhai and Jin, Han and Yao, Yuhang and Zhang, Jipeng and Zhang, Tong and Avestimehr, Salman and He, Chaoyang},
  title     = {{TensorOpera} Router: A Multi-Model Router for Efficient {LLM} Inference},
  editor    = {Dernoncourt, Franck and Preo{\c{t}}iuc-Pietro, Daniel and Shimorina, Anastasia},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, US},
  publisher = {Association for Computational Linguistics},
  pages     = {452--462},
  doi       = {10.18653/v1/2024.emnlp-industry.34},
  url       = {https://aclanthology.org/2024.emnlp-industry.34/},
}