@inproceedings{gao-etal-2025-analyzing,
title = "Analyzing and Evaluating Correlation Measures in {NLG} Meta-Evaluation",
author = "Gao, Mingqi and
Hu, Xinyu and
Lin, Li and
Wan, Xiaojun",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.naacl-long.111/",
pages = "2199--2222",
ISBN = "979-8-89176-189-6",
abstract = "The correlation between NLG automatic evaluation metrics and human evaluation is often regarded as a critical criterion for assessing the capability of an evaluation metric. However, different grouping methods and correlation coefficients result in various types of correlation measures used in meta-evaluation. In specific evaluation scenarios, prior work often directly follows conventional measure settings, but the characteristics and differences between these measures have not gotten sufficient attention. Therefore, this paper analyzes 12 common correlation measures using a large amount of real-world data from six widely-used NLG evaluation datasets and 32 evaluation metrics, revealing that different measures indeed impact the meta-evaluation results. Furthermore, we propose three perspectives that reflect the capability of meta-evaluation: discriminative power, ranking consistency, and sensitivity to score granularity. We find that the measure using global grouping and Pearson correlation coefficient exhibits the best performance in both discriminative power and ranking consistency. Besides, the measures using system-level grouping or Kendall correlation are the least sensitive to score granularity."
}
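
The abstract contrasts correlation measures built from different grouping methods (global vs. system-level) crossed with different coefficients (Pearson, Kendall). Below is a minimal sketch, not taken from the paper, of how two such measures are commonly computed in NLG meta-evaluation; the toy data, array shapes, and variable names are illustrative assumptions.

```python
# Illustrative sketch (not the paper's code) of two correlation measures:
# global grouping with Pearson, and system-level grouping with Kendall.
import numpy as np
from scipy.stats import pearsonr, kendalltau

# Toy scores: rows = systems, columns = source segments.
# human[i, j] / metric[i, j] = human/automatic score for system i on segment j.
rng = np.random.default_rng(0)
human = rng.normal(size=(4, 10))
metric = human + rng.normal(scale=0.5, size=(4, 10))

# Global grouping: pool all (system, segment) pairs into one flat list
# and compute a single Pearson correlation over it.
r_global, _ = pearsonr(metric.ravel(), human.ravel())

# System-level grouping: average each system's scores over segments first,
# then correlate the per-system means (here with Kendall's tau).
tau_system, _ = kendalltau(metric.mean(axis=1), human.mean(axis=1))

print(f"global Pearson:       {r_global:.3f}")
print(f"system-level Kendall: {tau_system:.3f}")
```

Global grouping treats every (system, segment) pair as one observation, while system-level grouping collapses each system to a single mean before correlating, which is why the two measures can rank the same set of metrics differently.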