% Conference paper (CoNLL 2026) by Garnham and Shareghi.
% NOTE(review): the url below points at a "preview ... ingest" host, which looks
%   like a staging link -- TODO confirm and swap for the canonical URL once known.
% NOTE(review): `address` presumably records the conference venue rather than
%   the publisher's city -- confirm this is what the target style expects.
% The proper noun in the title is brace-protected so sentence-casing styles
% do not lowercase it.
@inproceedings{garnham-shareghi-2026-language,
  author    = {Garnham, Jamie and
               Shareghi, Ehsan},
  title     = {Could language models win the {International Linguistics Olympiad}?},
  editor    = {Bonial, Claire and
               Berzak, Yevgeni},
  booktitle = {Proceedings of the 30th Conference on Computational Natural Language Learning},
  year      = {2026},
  month     = jul,
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {481--500},
  isbn      = {979-8-89176-410-1},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.28/},
  abstract  = {Linguistic puzzles, wherein the solver must deduce rules of an unfamiliar language purely in-context, represent a uniquely perplexing problem format even for state-of-the-art large language models. Yet by exploring various inference-time scaling methods, we demonstrate that language models' performance on these problems can be improved without the need for fine-tuning or providing supplementary linguistic context. To this end, this paper introduces the first domain-specific inference-time scaling framework for linguistic puzzles, which we use to improve the performance of three model families --- R1 (Deepseek), Gemini 2.5 Flash (Google), and Llama 3.3 70B Instruct (Meta) --- on a challenging Linguistics Olympiad-based benchmark by 4.9, 13.1, and 4.9 percentage points, respectively. Nonetheless, even when multiple optimisations are applied, we find that LLMs' linguistic puzzle performance remains well below comparable mathematical and commonsense benchmarks, and we speculate as to why linguistic reasoning continues to pose a distinctive challenge for even the most capable large language models.},
}
Markdown (Informal)
[Could language models win the International Linguistics Olympiad?](https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.28/) (Garnham & Shareghi, CoNLL 2026)
ACL