% Conference paper from the COLING 2025 main proceedings (ACL Anthology ID 2025.coling-main.134).
% The url field holds the canonical aclanthology.org permalink, not a preview-branch mirror.
% Editor Di Eugenio is entered surname-first so the compound family name stays together.
% The benchmark name and the term System-1 are braced in the title to survive sentence-casing styles.
@inproceedings{loredo-lopez-etal-2025-nyt,
    title = "{NYT-Connections}: A Deceptively Simple Text Classification Task that Stumps {System-1} Thinkers",
    author = "Loredo Lopez, Angel Yahir and
      McDonald, Tyler and
      Emami, Ali",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.134/",
    pages = "1952--1963",
    abstract = "Large Language Models (LLMs) have shown impressive performance on various benchmarks, yet their ability to engage in deliberate reasoning remains questionable. We present NYT-Connections, a collection of 358 simple word classification puzzles derived from the New York Times Connections game. This benchmark is designed to penalize quick, intuitive ``System 1'' thinking, isolating fundamental reasoning skills. We evaluated six recent LLMs, a simple machine learning heuristic, and humans across three configurations: single-attempt, multiple attempts without hints, and multiple attempts with contextual hints. Our findings reveal a significant performance gap: even top-performing LLMs like GPT-4 fall short of human performance by nearly 30{\%}. Notably, advanced prompting techniques such as Chain-of-Thought and Self-Consistency show diminishing returns as task difficulty increases. NYT-Connections uniquely combines linguistic isolation, resistance to intuitive shortcuts, and regular updates to mitigate data leakage, offering a novel tool for assessing LLM reasoning capabilities."
}
Markdown (Informal)
[NYT-Connections: A Deceptively Simple Text Classification Task that Stumps System-1 Thinkers](https://aclanthology.org/2025.coling-main.134/) (Loredo Lopez et al., COLING 2025)
ACL