% INLG 2025 paper, ACL Anthology ID 2025.inlg-main.37.
% The url field uses the canonical Anthology address; the exported record
% pointed at a temporary preview-branch build of the site.
@inproceedings{terentowicz-etal-2025-un,
    title     = {How (un)faithful are explainable {LLM}-based {NLG} metrics?},
    author    = {Terentowicz, Alex and
                 Lango, Mateusz and
                 Dusek, Ondrej},
    editor    = {Flek, Lucie and
                 Narayan, Shashi and
                 Phương, L{\^e} Hồng and
                 Pei, Jiahuan},
    booktitle = {Proceedings of the 18th International Natural Language Generation Conference},
    month     = oct,
    year      = {2025},
    address   = {Hanoi, Vietnam},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.inlg-main.37/},
    pages     = {617--658},
    abstract  = {Explainable NLG metrics are becoming a popular research topic; however, the faithfulness of the explanations they provide is typically not evaluated. In this work, we propose a testbed for assessing the faithfulness of span-based metrics by performing controlled perturbations of their explanations and observing changes in the final score. We show that several popular LLM evaluators do not consistently produce faithful explanations.},
}
Markdown (Informal)
[How (un)faithful are explainable LLM-based NLG metrics?](https://aclanthology.org/2025.inlg-main.37/) (Terentowicz et al., INLG 2025)
ACL