@inproceedings{haddadi-teahan-2026-comparing,
title = "Comparing Text Compression Capabilities of Large Language Models with Traditional Compression Algorithms",
author = "Haddadi, Mehran and
Teahan, William John",
editor = "Baez Santamaria, Selene and
Somayajula, Sai Ashish and
Yamaguchi, Atsuki",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.16/",
pages = "219--232",
isbn = "979-8-89176-383-8",
abstract = "This work evaluates the non-English and unstructured text compression performance of Large Language Models (LLMs) by comparing them with traditional baselines on datasets from eight most widely spoken languages. Experimental results show that the evaluated LLM (LLaMA-3.2-1B) was considerably outperformed by the baselines, particularly on non-English datasets, where its performance relative to the best baseline was more than three times worse than on English datasets on average. It also compressed unstructured English data up to more than twofold less effectively than plain English data. Traditional methods, however, remained largely dataset-agnostic. Surprisingly, the LLM achieved worse compression ratios on some datasets than others despite modeling them more accurately. Overall, the outcomes and substantially higher compression time and resource consumption indicate that current LLMs are highly impractical for the compression task, where traditional methods continue to excel. Codes are available at: https://github.com/mehranhaddadi13/llm{\_}compress."
}
Markdown (Informal)
[Comparing Text Compression Capabilities of Large Language Models with Traditional Compression Algorithms](https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.16/) (Haddadi & Teahan, EACL 2026)
ACL