@inproceedings{liu-2026-wordle,
  title     = {From {Wordle} to {Fibble$^5$}: Evaluating {LLM} Reasoning Under Escalating Deception},
  author    = {Liu, Chang},
  editor    = {Akhtar, Mubashara and
               Batzner, Jan and
               Choshen, Leshem and
               Ghosh, Avijit and
               Gohar, Usman and
               Mickel, Jennifer and
               Pant, Ichhya and
               Talat, Zeerak and
               Lin, Michelle},
  booktitle = {Proceedings of the Workshop on Evaluating Evaluations ({EvalEval})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.5/},
  pages     = {36--45},
  isbn      = {979-8-89176-429-3},
  abstract  = {Standard benchmarks for large language models (LLMs) assume that task feedback is truthful, but real-world reasoning often requires processing unreliable or adversarial information. We introduce WordleArenas, a benchmark platform that evaluates LLM reasoning robustness across a deception gradient. Building on Wordle and its deceptive variant Fibble (Chusap et al., 2025), we generalize to Fibble$^k$ ($k = 0, \ldots, 5$ lies per row), creating a controlled evaluation of LLM robustness to misinformation. Across six arenas {---} standard Wordle (0 lies per row) through Fibble$^5$ (5 lies per row) {---} we evaluate 41 models from 10 providers across 3,749 games. We find that (1) even one lie per row causes catastrophic performance drops (average win rate falls from 41.1{\%} to 18.7{\%}), (2) a sharp deception cliff emerges at 2{--}3 lies where nearly all models collapse to {\ensuremath{\leq}}3{\%} win rate, and (3) model robustness to deception is poorly predicted by standard benchmark rankings. A surprising Fibble$^5$ recovery emerges: some models recover partial performance when all feedback lies (average 9.5{\%}), outperforming Fibble$^3$ (0.3{\%}) and Fibble$^4$ (0.4{\%}), because knowing that every tile lies restores deterministic {---} though partial {---} information. Our results demonstrate that truthful-feedback evaluations systematically overestimate LLM reasoning capabilities and that deception-aware benchmarks are essential for assessing real-world robustness. All code and data are publicly available.},
}
Markdown (Informal)
[From Wordle to Fibble5: Evaluating LLM Reasoning Under Escalating Deception](https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.5/) (Liu, EvalEval 2026)
ACL