@inproceedings{qin-etal-2025-data,
    title     = {Data Drives Unstable Hierarchical Generalization in {LM}s},
    author    = {Qin, Tian and
                 Saphra, Naomi and
                 Alvarez-Melis, David},
    editor    = {Christodoulopoulos, Christos and
                 Chakraborty, Tanmoy and
                 Rose, Carolyn and
                 Peng, Violet},
    booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
    month     = nov,
    year      = {2025},
    address   = {Suzhou, China},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.emnlp-main.593/},
    doi       = {10.18653/v1/2025.emnlp-main.593},
    pages     = {11733--11751},
    isbn      = {979-8-89176-332-6},
    abstract  = {Early in training, LMs can behave like n-gram models, but eventually, they often learn tree-based syntactic rules and generalize hierarchically out of distribution (OOD). We study this shift using controlled grammar-learning tasks: question formation and tense inflection. We find a model learns to generalize hierarchically if its training data is *complex*{--}in particular, if it includes center-embedded clauses, a special syntactic structure. Under this definition, complex data drives hierarchical rules, while less complex data encourages shortcut learning in the form of n-gram-like linear rules. Furthermore, we find that a model uses rules to generalize, whether hierarchical or linear, if its training data is *diverse*{--}in particular, if it includes many distinct syntax trees in the training set. Under this definition, diverse data promotes stable rule learning, whereas less diverse data promotes memorization of individual syntactic sequences. Finally, intermediate diversity and intermediate complexity form an *unstable regime*, which is characterized by oscillatory learning dynamics and inconsistent behaviors across random seeds. These results highlight the central role of training data in shaping generalization and explain why competing strategies can lead to unstable outcomes.}
}
Markdown (Informal)
[Data Drives Unstable Hierarchical Generalization in LMs](https://aclanthology.org/2025.emnlp-main.593/) (Qin et al., EMNLP 2025)
ACL
- Tian Qin, Naomi Saphra, and David Alvarez-Melis. 2025. Data Drives Unstable Hierarchical Generalization in LMs. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 11733–11751, Suzhou, China. Association for Computational Linguistics.