@inproceedings{chiang-lee-2024-merging,
title = "Merging Facts, Crafting Fallacies: Evaluating the Contradictory Nature of Aggregated Factual Claims in Long-Form Generations",
author = "Chiang, Cheng-Han and
Lee, Hung-yi",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.findings-acl.160/",
doi = "10.18653/v1/2024.findings-acl.160",
pages = "2734--2751",
abstract = "Long-form generations from large language models (LLMs) contain a mix of factual and non-factual claims, making evaluating factuality difficult.Prior works evaluate the factuality of a long paragraph by decomposing it into multiple facts, verifying those facts independently, and aggregating the results.Such methods assume that combining factual claims forms a factual paragraph.The above assumption can be violated: we show that strong open-source models like Llama-chat can generate paragraphs that contain verifiable facts, but the facts are combined into a non-factual paragraph due to entity ambiguity.We further reveal that existing factuality metrics, including FActScore and citation recall, cannot properly evaluate these non-factual paragraphs and overestimate their factuality.To address this, we introduce an enhanced metric, **D-FActScore**, specifically designed for content with ambiguous entities.We evaluate the D-FActScores of people biographies generated by retrieval-augmented LLMs.We show that D-FActScore can better assess the factuality of paragraphs with entity ambiguity than FActScore.We also find that four widely used open-source LLMs tend to mix information of distinct entities to form non-factual paragraphs, making their D-FActScore much lower than FActScore by over 10{\%}."
}
Markdown (Informal)
[Merging Facts, Crafting Fallacies: Evaluating the Contradictory Nature of Aggregated Factual Claims in Long-Form Generations](https://preview.aclanthology.org/add-emnlp-2024-awards/2024.findings-acl.160/) (Chiang & Lee, Findings 2024)
ACL