@comment{
  Findings of EMNLP 2025 paper on FOL closeness metrics.
  The url field uses the canonical ACL Anthology landing page (derived from
  the Anthology ID that also appears in the doi) rather than a temporary
  preview-branch address.
}
@inproceedings{thatikonda-etal-2025-assessing,
  title     = {Assessing the Sensitivity and Alignment of {FOL} Closeness Metrics},
  author    = {Thatikonda, Ramya Keerthy and
               Buntine, Wray and
               Shareghi, Ehsan},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.910/},
  doi       = {10.18653/v1/2025.findings-emnlp.910},
  pages     = {16775--16785},
  isbn      = {979-8-89176-335-7},
  abstract  = {The recent successful paradigm of solving logical reasoning problems with tool-augmented large language models (LLMs) leverages translation of natural language (NL) statements into First-Order Logic (FOL) and external theorem provers. However, the correctness of FOL statements, comprising operators and text, often go unverified due to the lack of a reliable evaluation metric for comparing generated and ground-truth FOLs. In this paper, we conduct a comprehensive study on the sensitivity of existing metrics{---}NL, FOL, and graph-based{---} and their alignment with LLM as a judge on FOL evaluation to measure robustness. We introduce operator and text-based perturbations to ground-truth FOL statements to assess metric sensitivity. We then evaluate metric robustness by comparing them against LLMs judgement. Our empirical findings highlight a clear oversensitivity in the n-gram metric BLEU for text perturbations. The operator perturbation affects the semantic graph metric Smatch++ for structural changes, and the FOL metric for specific operator changes. We observe a closer alignment between BertScore and LLM judgement, proving the importance of semantic evaluation. Additionally, we show that combining metrics enhances both robustness and sensitivity compared to using individual metrics.},
}
Markdown (Informal)
[Assessing the Sensitivity and Alignment of FOL Closeness Metrics](https://aclanthology.org/2025.findings-emnlp.910/) (Thatikonda et al., Findings 2025)
ACL