% AnaToM paper by Suzuki, Ishigaki, and Maeda (conference paper entry).
% NOTE(review): the url field points at a preview/ingest host
%   (preview.aclanthology.org/ingest-ijcnlp-aacl); confirm the canonical,
%   permanent URL (or a DOI) before relying on this entry.
% NOTE(review): the URL identifier contains "findings-ijcnlp" while booktitle
%   names the main conference proceedings; verify which volume is correct.
@inproceedings{suzuki-etal-2025-anatom,
  title     = {{AnaToM}: A Dataset Generation Framework for Evaluating Theory of Mind Reasoning Toward the Anatomy of Difficulty through Structurally Controlled Story Generation},
  author    = {Suzuki, Jundai and
               Ishigaki, Ryoma and
               Maeda, Eisaku},
  editor    = {Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.14/},
  pages     = {244--257},
  isbn      = {979-8-89176-303-6},
  abstract  = {Evaluating Theory of Mind (ToM) in Large Language Models (LLMs) is an important area of research for understanding the social intelligence of AI. Recent ToM benchmarks have made significant strides in enhancing the complexity, comprehensiveness, and practicality of evaluation. However, while the focus has been on constructing ``more difficult'' or ``more comprehensive'' tasks, there has been insufficient systematic analysis of the structural factors that inherently determine the difficulty of ToM reasoning{---}that is, ``what'' makes reasoning difficult. To address this challenge, we propose a new dataset generation framework for ToM evaluation named AnaToM. To realize an ``Anatomy of Difficulty'' in ToM reasoning, AnaToM strictly controls structural parameters such as the number of entities and the timeline in a story. This parameter control enables the isolation and identification of factors affecting the ToM of LLMs, allowing for a more precise examination of their reasoning mechanisms. The proposed framework provides a systematic methodology for diagnosing the limits of LLM reasoning abilities and offers new guidelines for future benchmark design.},
}
Markdown (Informal)
[AnaToM: A Dataset Generation Framework for Evaluating Theory of Mind Reasoning Toward the Anatomy of Difficulty through Structurally Controlled Story Generation](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.14/) (Suzuki et al., Findings 2025)
ACL