@inproceedings{mamta-cocarascu-2025-facteval,
title = "{F}act{E}val: Evaluating the Robustness of Fact Verification Systems in the Era of Large Language Models",
author = "Mamta, Mamta and
Cocarascu, Oana",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.naacl-long.534/",
pages = "10647--10660",
ISBN = "979-8-89176-189-6",
abstract = "Whilst large language models (LLMs) have made significant advances in every natural language processing task, studies have shown that these models are vulnerable to small perturbations in the inputs, raising concerns about their robustness in the real-world. Given the rise of misinformation online and its significant impact on society, fact verification is one area in which assessing the robustness of models developed for this task is crucial. However, the robustness of LLMs in fact verification remains largely unexplored. In this paper, we introduce FactEval, a novel large-scale benchmark for extensive evaluation of LLMs in the fact verification domain covering 17 realistic word-level and character-level perturbations and 4 types of subpopulations. We investigate the robustness of several LLMs in zero-shot, few-shot, and chain-of-thought prompting. Our analysis using FEVER, one of the largest and most widely-used datasets for fact verification, reveals that LLMs are brittle to small input changes and also exhibit performance variations across different subpopulations."
}
Markdown (Informal)
[FactEval: Evaluating the Robustness of Fact Verification Systems in the Era of Large Language Models](https://aclanthology.org/2025.naacl-long.534/) (Mamta & Cocarascu, NAACL 2025)
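
For a concrete picture of the kind of robustness tests the abstract describes, here is a minimal sketch, not the authors' implementation: one character-level and one word-level perturbation applied to a FEVER-style claim before building a zero-shot verification prompt. All function names, the sample claim, and the prompt wording are illustrative assumptions.

```python
# Illustrative sketch of FactEval-style robustness probing (assumed, simplified):
# perturb a claim at the character and word level, then wrap it in a
# zero-shot fact-verification prompt with a FEVER-style label set.
import random

random.seed(0)

def char_swap(claim: str) -> str:
    """Character-level perturbation: swap two adjacent characters in one word."""
    words = claim.split()
    idx = random.randrange(len(words))
    w = words[idx]
    if len(w) > 3:
        i = random.randrange(len(w) - 1)
        w = w[:i] + w[i + 1] + w[i] + w[i + 2:]
    words[idx] = w
    return " ".join(words)

def word_drop(claim: str) -> str:
    """Word-level perturbation: delete a single non-initial word."""
    words = claim.split()
    if len(words) > 2:
        del words[random.randrange(1, len(words))]
    return " ".join(words)

def zero_shot_prompt(claim: str) -> str:
    """Zero-shot verification prompt using a FEVER-style label set."""
    return (
        "Decide whether the following claim is SUPPORTED, REFUTED, "
        "or there is NOT ENOUGH INFO.\n"
        f"Claim: {claim}\nAnswer:"
    )

claim = "Barack Obama was born in Hawaii."
for perturb in (char_swap, word_drop):
    print(zero_shot_prompt(perturb(claim)))
```

The paper itself defines the full set of 17 perturbations and 4 subpopulation types evaluated on FEVER.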