% Behdin et al. (2025), "Scaling Down, Serving Fast": paper in the EMNLP 2025
% Industry Track proceedings (pages 1687--1702).
% The url field uses the canonical aclanthology.org address for Anthology ID
% 2025.emnlp-industry.119 instead of the temporary preview/ingest staging host.
% NOTE(review): confirm the canonical page resolves; no doi is recorded here.
@inproceedings{behdin-etal-2025-scaling,
  title     = {Scaling Down, Serving Fast: Compressing and Deploying Efficient {LLM}s for Recommendation Systems},
  author    = {Behdin, Kayhan and
               Fatahibaarzi, Ata and
               Song, Qingquan and
               Dai, Yun and
               Gupta, Aman and
               Wang, Zhipeng and
               Sang, Hejian and
               Tang, Shao and
               Dexter, Gregory and
               Zhu, Sirou and
               Zhu, Siyu and
               Dharamsi, Tejas and
               Kothapalli, Vignesh and
               Fu, Zhoutong and
               Cao, Yihan and
               Hsu, Pin-Lun and
               Borisyuk, Fedor and
               Pillai, Natesh S. and
               Simon, Luke and
               Mazumder, Rahul},
  editor    = {Potdar, Saloni and
               Rojas-Barahona, Lina and
               Montella, Sebastien},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track},
  month     = nov,
  year      = {2025},
  address   = {Suzhou (China)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-industry.119/},
  pages     = {1687--1702},
  isbn      = {979-8-89176-333-3},
}