@inproceedings{wu-last-2025-transitive,
title = "Transitive self-consistency evaluation of {NLI} models without gold labels",
author = "Wu, Wei and
Last, Mark",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1152/",
pages = "22637--22653",
ISBN = "979-8-89176-332-6",
abstract = "Natural Language Inference (NLI) is an important task in natural language processing. NLI models are aimed at automatically determining logical relationships between pairs of sentences. However, recent studies based on gold labels assigned to sentence pairs by human experts have provided some evidence that NLI models tend to make inconsistent model decisions during inference. Previous studies have used existing NLI datasets to test the transitive consistency of language models. However, they test only variations of two transitive consistency rules out of four. To further evaluate the transitive consistency of NLI models, we propose a novel evaluation approach that allows us to test all four rules automatically by generating adversarial examples via antonym replacements. Since we are testing self-consistency, human labeling of generated adversarial examples is unnecessary. Our experiments on several benchmark datasets indicate that the examples generated by the proposed antonym replacement methodology can reveal transitive inconsistencies in the state-of-the-art NLI models."
}Markdown (Informal)
[Transitive self-consistency evaluation of NLI models without gold labels](https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1152/) (Wu & Last, EMNLP 2025)
ACL