@inproceedings{zheng-etal-2025-long-form,
    title = "Long-Form Information Alignment Evaluation Beyond Atomic Facts",
    author = "Zheng, Danna and
      Lapata, Mirella and
      Pan, Jeff Z.",
    editor = "Christodoulopoulos, Christos and
      Chakraborty, Tanmoy and
      Rose, Carolyn and
      Peng, Violet",
    booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.emnlp-main.558/",
    doi = "10.18653/v1/2025.emnlp-main.558",
    pages = "11018--11038",
    isbn = "979-8-89176-332-6",
    abstract = "Information alignment evaluators are vital for various NLG evaluation tasks and trustworthy LLM deployment, reducing hallucinations and enhancing user trust. Current fine-grained methods, like FactScore, verify facts individually but neglect inter-fact dependencies, enabling subtle vulnerabilities. In this work, we introduce MontageLie, a challenging benchmark that constructs deceptive narratives by ``montaging'' truthful statements without introducing explicit hallucinations. We demonstrate that both coarse-grained LLM-based evaluators and current fine-grained frameworks are susceptible to this attack, with AUC-ROC scores falling below 65{\%}. To enable more robust fine-grained evaluation, we propose DoveScore, a novel framework that jointly verifies factual accuracy and event-order consistency. By modeling inter-fact relationships, DoveScore outperforms existing fine-grained methods by over 8{\%}, providing a more robust solution for long-form text alignment evaluation. Our code and datasets are available at https://github.com/dannalily/DoveScore."
}
@comment{Informal Markdown citation (from the ACL Anthology page):
[Long-Form Information Alignment Evaluation Beyond Atomic Facts](https://aclanthology.org/2025.emnlp-main.558/) (Zheng et al., EMNLP 2025)
ACL}