@inproceedings{gupta-etal-2025-found,
title = "Found in Translation: Measuring Multilingual {LLM} Consistency as Simple as Translate then Evaluate",
author = "Gupta, Ashim and
Mehta, Maitrey and
Xu, Zhichao and
Srikumar, Vivek",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.185/",
pages = "3477--3496",
ISBN = "979-8-89176-298-5",
    abstract = "Large language models (LLMs) provide detailed and impressive responses to queries in English. However, are they really consistent at responding to the same query in other languages? The popular way of evaluating the multilingual performance of LLMs requires expensive-to-collect annotated datasets. Further, evaluating tasks like open-ended generation, where multiple correct answers may exist, is nontrivial. Instead, we propose to evaluate the predictability of model responses across different languages. In this work, we propose a framework to evaluate an LLM{'}s cross-lingual consistency based on a simple Translate then Evaluate strategy. We instantiate this evaluation framework along two dimensions of consistency: information and empathy. Our results reveal pronounced inconsistencies in popular LLM responses across thirty languages, with severe performance deficits in certain language families and scripts, underscoring critical weaknesses in their multilingual capabilities. These findings necessitate cross-lingual evaluations that are consistent along multiple dimensions. We invite practitioners to use our framework for future multilingual LLM benchmarking."
}

Markdown (Informal)
[Found in Translation: Measuring Multilingual LLM Consistency as Simple as Translate then Evaluate](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.185/) (Gupta et al., IJCNLP-AACL 2025)
ACL
Ashim Gupta, Maitrey Mehta, Zhichao Xu, and Vivek Srikumar. 2025. Found in Translation: Measuring Multilingual LLM Consistency as Simple as Translate then Evaluate. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 3477–3496, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.
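The abstract's Translate then Evaluate strategy lends itself to a compact illustration. The following is a minimal, hypothetical Python sketch, not the authors' code: the `translate` stub stands in for a real machine-translation system, and the token-overlap `consistency` metric is a placeholder for the paper's information and empathy consistency measures.

```python
# Minimal sketch of a Translate-then-Evaluate consistency check.
# Hypothetical stand-ins: `translate` is a placeholder for a real MT
# system, and token-overlap F1 is a placeholder for the paper's
# information/empathy consistency measures.

def translate(text: str, src_lang: str, tgt_lang: str = "en") -> str:
    """Placeholder MT call; plug in a real translation model or API."""
    raise NotImplementedError("swap in a machine translation system")

def consistency(reference: str, candidate: str) -> float:
    """Toy metric: token-overlap F1 between two English strings."""
    ref = set(reference.lower().split())
    cand = set(candidate.lower().split())
    common = ref & cand
    if not common:
        return 0.0
    precision = len(common) / len(cand)
    recall = len(common) / len(ref)
    return 2 * precision * recall / (precision + recall)

def cross_lingual_consistency(llm, query_en: str, query_xx: str,
                              lang: str) -> float:
    """Ask the same query in English and in `lang`, translate the
    non-English response back to English, and score their agreement."""
    response_en = llm(query_en)    # response to the English query
    response_xx = llm(query_xx)    # response to the query in `lang`
    back_translated = translate(response_xx, src_lang=lang)
    return consistency(response_en, back_translated)
```

In the paper itself, the evaluation step scores information and empathy consistency across thirty languages; the overlap metric above is only an illustrative stand-in for any reference-free comparison of the two English-language responses.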