@inproceedings{ishii-etal-2025-fine,
  title     = {Fine-grained Confidence Estimation for Spurious Correctness Detection in Large Language Models},
  author    = {Ishii, Ai and
               Inoue, Naoya and
               Suzuki, Hisami and
               Sekine, Satoshi},
  editor    = {Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.ijcnlp-long.68/},
  pages     = {1238--1257},
  isbn      = {979-8-89176-298-5},
  abstract  = {In the deployment of Large Language Models (LLMs), ``spurious correctness''{---}where answers are correct but reasoning contains errors{---}poses a critical risk by creating an illusion of reliability. While prior work on LLM confidence estimation focuses on answer-level or entire reasoning path confidence, these coarse-grained approaches fail to identify which specific parts of the reasoning contain errors. We propose a fine-grained confidence estimation framework that computes confidence scores for individual evidence triplets within reasoning chains, enabling precise localization of errors. Using carefully designed prompts, we generate answers, evidence in triplet format, and their respective confidence scores simultaneously, allowing automatic detection of spurious correctness patterns where partial evidence contains factual errors. Evaluated on both Japanese and English multi-hop QA benchmarks across multiple models from three model families representing different architectures and training approaches, we show that our approach exhibits superior calibration performance for evidence confidence and demonstrates effective ability to detect spurious correct answers (up to 0.84 on our primary discrimination metric). The consistent improvements across languages demonstrate the generalizability of our method. As a secondary benefit, joint generation of confidence scores improves answer confidence calibration by up to 43{\%}. This prompt-based approach requires no model retraining and is immediately applicable to existing LLMs.},
}
Markdown (Informal)
[Fine-grained Confidence Estimation for Spurious Correctness Detection in Large Language Models](https://aclanthology.org/2025.ijcnlp-long.68/) (Ishii et al., IJCNLP-AACL 2025)
ACL
- Ai Ishii, Naoya Inoue, Hisami Suzuki, and Satoshi Sekine. 2025. Fine-grained Confidence Estimation for Spurious Correctness Detection in Large Language Models. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 1238–1257, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.