% Findings of ACL 2025 paper on reference-less evaluation of multilingual KG-to-Text generation.
% The url is the canonical Anthology address (not a temporary preview-branch link), and
% acronyms in the title are braced so sentence-casing styles keep their capitals.
@inproceedings{martinez-etal-2025-semantic,
  title     = {Semantic Evaluation of Multilingual Data-to-Text Generation via {NLI} Fine-Tuning: Precision, Recall and {F1} scores},
  author    = {Martinez, William Soto and
               Parmentier, Yannick and
               Gardent, Claire},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.542/},
  pages     = {10407--10427},
  isbn      = {979-8-89176-256-5},
  abstract  = {Performance in the KG-to-Text task has improved over the years, particularly in English. However, models are still prone to mistakes like Additions and Omissions. Furthermore, few languages are taken into account since both train and test data are not readily available. In this paper, we hope to facilitate the development and improvement of multilingual KG-to-Text models by providing a multilingual evaluation framework that is reference-less (no need for test data) and permits estimating how much a KG-to-Text Model under- (omission) or over- (addition) generates. We focus on two high (English, Russian) and five low (Breton, Irish, Maltese, Welsh, Xhosa) resource languages and show that our metric has fair to moderate correlation with reference-based metrics, positioning it as a consistent alternative when no references are available. We also show that our metric outperforms prior reference-less metrics in correlation with existing human judgments. Additional human evaluation shows moderate to strong correlation with human annotators in assessing precision and recall at a higher granularity level than shown in previous studies. Since our metric provides scores for precision and recall, it helps better assess the level of over- or under-generation of multilingual KG-to-Text models.},
}
Markdown (Informal)
[Semantic Evaluation of Multilingual Data-to-Text Generation via NLI Fine-Tuning: Precision, Recall and F1 scores](https://aclanthology.org/2025.findings-acl.542/) (Martinez et al., Findings 2025)
ACL