@inproceedings{sy-etal-2025-efficient,
title = "Efficient One-shot Compression via Low-Rank Local Feature Distillation",
author = "Sy, Yaya and
Cerisara, Christophe and
Illina, Irina",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.naacl-long.291/",
pages = "5643--5661",
ISBN = "979-8-89176-189-6",
abstract = "Current structured pruning approaches for large language models typically involve two steps: (1) compression using calibration data and (2) costly continued pretraining on billions of tokens to recover lost performance. This second step is necessary as the first significantly impacts model accuracy. Moreover, prior research suggests that pretrained Transformer weights are not necessarily low-rank, unlike their activations, making one-shot structured pruning challenging. Based on this observation, we propose Lillama, a compression method that locally distills activations with low-rank weights. Using SVD for initialization and a joint loss combining teacher and student activations, we accelerate convergence and reduce memory use with local gradient updates. Lillama compresses Mixtral-8x7B within minutes on a single A100 GPU, removing 10 billion parameters while retaining over 95{\%} of its original performance. Phi-2 3B can be compressed by 40{\%} with just 13 million calibration tokens, resulting in a small model that competes with recent models of similar size. The method generalizes well to non-transformer architectures, compressing Mamba-3B by 20{\%} while maintaining 99{\%} performance."
}
Markdown (Informal)
[Efficient One-shot Compression via Low-Rank Local Feature Distillation](https://aclanthology.org/2025.naacl-long.291/) (Sy et al., NAACL 2025)
ACL
Yaya Sy, Christophe Cerisara, and Irina Illina. 2025. Efficient One-shot Compression via Low-Rank Local Feature Distillation. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 5643–5661, Albuquerque, New Mexico. Association for Computational Linguistics.
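The abstract only sketches the method at a high level: truncated SVD provides a low-rank warm start for a weight matrix, and the resulting factors are then trained locally to match the original (teacher) layer's activations on calibration data. The snippet below is a minimal PyTorch sketch of that general idea, not the authors' Lillama implementation; the function names (`low_rank_init`, `local_distill_loss`), the rank, the plain MSE loss, and the training loop are illustrative assumptions.

```python
# Hypothetical illustration of SVD-initialized low-rank compression with local
# activation distillation, loosely following the ideas named in the abstract.
import torch


def low_rank_init(W: torch.Tensor, rank: int):
    """Truncated-SVD warm start: W (out x in) ~= A @ B, with A (out x r), B (r x in)."""
    U, S, Vh = torch.linalg.svd(W, full_matrices=False)
    A = U[:, :rank] * S[:rank]   # absorb singular values into the left factor
    B = Vh[:rank, :]
    return A, B


def local_distill_loss(teacher_out: torch.Tensor, student_out: torch.Tensor) -> torch.Tensor:
    """Match the student's local activations to the teacher's (plain MSE here)."""
    return torch.nn.functional.mse_loss(student_out, teacher_out)


# Toy usage on a single linear layer with calibration activations X (batch x in).
W = torch.randn(1024, 1024)            # frozen dense weight of the teacher layer
X = torch.randn(32, 1024)              # calibration inputs to this layer
A, B = low_rank_init(W, rank=256)
A, B = A.requires_grad_(), B.requires_grad_()
optimizer = torch.optim.Adam([A, B], lr=1e-3)

teacher_out = X @ W.T                  # teacher activations, computed once
for _ in range(100):                   # local gradient updates on this layer only
    student_out = X @ (A @ B).T
    loss = local_distill_loss(teacher_out, student_out)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```

Because each layer is fitted against its own teacher activations, gradients never flow across layers; that locality is what makes a scheme of this kind cheap in memory, consistent with the abstract's mention of local gradient updates.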