@inproceedings{mikhaylovskiy-2025-zipfs,
    title = "{Z}ipf{'}s and {H}eaps{'} Laws for Tokens and {LLM}-generated Texts",
author = "Mikhaylovskiy, Nikolay",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-luhme/2025.findings-emnlp.837/",
doi = "10.18653/v1/2025.findings-emnlp.837",
pages = "15469--15481",
ISBN = "979-8-89176-335-7",
    abstract = "The frequency distribution of words in human-written texts roughly follows a simple mathematical form known as Zipf{'}s law. Somewhat less well known is the related Heaps{'} law, which describes a sublinear power-law growth of vocabulary size with document size. We study the applicability of Zipf{'}s and Heaps{'} laws to texts generated by Large Language Models (LLMs). We empirically show that Heaps{'} and Zipf{'}s laws only hold for LLM-generated texts in a narrow, model-dependent temperature range. These temperatures have an optimal value close to $t=1$ for all the base models except the large Llama models, are higher for instruction-finetuned models, and do not depend on the model size or prompting. This independently confirms the recent discovery of sampling-temperature-dependent phase transitions in LLM-generated texts."
}
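% For reference, the two laws discussed in the abstract are usually written in the
% standard textbook forms below. The notation is illustrative only and is not taken
% from the paper itself:
%
%   Zipf's law:   f(r) \propto r^{-\alpha},  \alpha \approx 1
%                 (f(r) = frequency of the r-th most frequent token)
%   Heaps' law:   V(n) \propto n^{\beta},    0 < \beta < 1
%                 (V(n) = vocabulary size after n tokens)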