@inproceedings{ouyang-2025-treecut,
title = "{T}ree{C}ut: A Synthetic Unanswerable Math Word Problem Dataset for {LLM} Hallucination Evaluation",
author = "Ouyang, Jialin",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.acl-short.84/",
pages = "1073--1085",
ISBN = "979-8-89176-252-7",
abstract = "Large language models (LLMs) now achieve near-human performance on standard math word problem benchmarks (e.g., GSM8K), yet their true reasoning ability remains disputed. A key concern is that models often produce confident, yet unfounded, answers to unanswerable problems. We introduce TreeCut, a synthetic dataset that systematically generates infinite unanswerable math word problems and their answerable counterparts, by representing each question as a tree and removing chosen necessary conditions. Experiments show TreeCut effectively induce hallucinations in large language models, including GPT-4o and o3-mini, with rates of 64{\%} and 44{\%} in their respective worst-case scenarios under zero-shot setting. Further analysis highlights that deeper or more complex trees, composite item names, and removing necessary condition near the middle of a path all increase the likelihood of hallucinations, underscoring the persistent challenges LLMs face in identifying unanswerable math problems. The dataset generation code and sample data are available at \url{https://github.com/j-bagel/treecut-math}."
}
Markdown (Informal)
[TreeCut: A Synthetic Unanswerable Math Word Problem Dataset for LLM Hallucination Evaluation](https://preview.aclanthology.org/landing_page/2025.acl-short.84/) (Ouyang, ACL 2025)
ACL