@inproceedings{hong-etal-2025-consistencychecker,
    title = "{ConsistencyChecker}: Tree-based Evaluation of {LLM} Generalization Capabilities",
    author = "Hong, Zhaochen and
      Yu, Haofei and
      You, Jiaxuan",
    editor = "Che, Wanxiang and
      Nabende, Joyce and
      Shutova, Ekaterina and
      Pilehvar, Mohammad Taher",
    booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.acl-long.1585/",
    pages = "33039--33075",
    isbn = "979-8-89176-251-0",
    abstract = "Evaluating Large Language Models (LLMs) requires effective methods to assess semantic consistency across multiple reversible transformations. Traditional self-consistency methods often fail to capture subtle semantic errors in multi-step tasks. We introduce ConsistencyChecker, a tree-based evaluation framework that measures LLMs' ability to preserve semantic consistency during reversible transformation processes, sidestepping benchmark data contamination issues. Our approach constructs self-consistency trees where nodes represent text states after transformations (e.g., translation, code modification, paraphrasing) and edges represent pairs of opposite transformations. By analyzing semantic preservation between nodes at different tree depths, ConsistencyChecker quantifies model reliability without requiring manually annotated reference data. Experiments demonstrate that ConsistencyChecker reliably measures generalization abilities across models from 1.5B to 72B parameters. On translation tasks, GPT-4o Mini achieves the highest L3 consistency score of 98.0{\%}. For code generation, Qwen 2.5 32B leads with 85.1{\%} semantic consistency at L3. Results show Pearson correlation greater than 0.7 between our embedding-based scores and WMT 2024 rankings on 4 out of 5 shared language pairs, validating the method{'}s effectiveness for benchmarking LLM performance without constructing new datasets.",
    internal-note = "url normalized from preview.aclanthology.org staging link to canonical aclanthology.org; verify resolution. DOI not added: not present in original export.",
}
@comment{
Markdown (Informal)
[ConsistencyChecker: Tree-based Evaluation of LLM Generalization Capabilities](https://aclanthology.org/2025.acl-long.1585/) (Hong et al., ACL 2025)
ACL
}