@inproceedings{thakur-etal-2025-mirage,
title = "{MIRAGE}-Bench: Automatic Multilingual Benchmark Arena for Retrieval-Augmented Generation Systems",
author = "Thakur, Nandan and
Kazi, Suleman and
Luo, Ge and
Lin, Jimmy and
Ahmad, Amin",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.naacl-long.14/",
pages = "274--298",
ISBN = "979-8-89176-189-6",
abstract = "Traditional retrieval-augmented generation (RAG) benchmarks evaluate systems using heuristic-based metrics, but these require human preferences as the ground truth for reference. In contrast, arena-based benchmarks, where systems compete against each other, require an expensive large language model (LLM) as a judge for a reliable evaluation. We present a simple efficient technique to combine the best of both worlds. The idea is to train a surrogate judge using heuristic metrics as input, to output the LLM as a judge prediction.In our work, we develop MIRAGE-Bench, a synthetic arena-based RAG benchmark for 18 diverse languages on Wikipedia focused on multilingual answer generation evaluation. It extensively couples both heuristic features and LLM as a judge for evaluation. We benchmark 19 multilingual LLMs, and observe a high correlation (Kendall Tau ($\tau$) = 0.909) using our surrogate judge and between GPT-4o as a teacher using the Bradley-Terry framework. Our results show proprietary and large open-source LLMs currently dominate on MIRAGE-Bench. Our code and datasets are made publicly available here: https://github.com/vectara/mirage-bench."
}
[MIRAGE-Bench: Automatic Multilingual Benchmark Arena for Retrieval-Augmented Generation Systems](https://aclanthology.org/2025.naacl-long.14/) (Thakur et al., NAACL 2025)
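
As a minimal, hedged sketch of the surrogate-judge idea described in the abstract: a cheap model is trained on heuristic metrics to approximate an expensive LLM judge, and Kendall Tau measures how well the induced system ranking agrees with the teacher's. The feature names, data, and the regression-instead-of-pairwise setup below are illustrative assumptions, not the released MIRAGE-Bench code (which additionally uses Bradley-Terry pairwise modeling).

```python
# Minimal sketch (not the authors' implementation) of a surrogate judge:
# predict an expensive LLM judge's scores from cheap heuristic metrics,
# then check rank agreement with the teacher via Kendall Tau.
import numpy as np
from scipy.stats import kendalltau
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(0)
n_systems, n_queries = 19, 200

# Hypothetical heuristic features per (system, query) pair, e.g. answer
# overlap, citation recall, a fluency proxy, and language-ID confidence.
X = rng.random((n_systems * n_queries, 4))
# Simulated "expensive" LLM-as-a-judge scores, correlated with the heuristics.
y = X @ np.array([0.5, 0.3, 0.1, 0.1]) + 0.05 * rng.standard_normal(len(X))
system_ids = np.repeat(np.arange(n_systems), n_queries)

# Fit the surrogate on one half of the queries, predict on the other half.
train = np.tile(np.arange(n_queries) < n_queries // 2, n_systems)
model = GradientBoostingRegressor().fit(X[train], y[train])
y_pred = model.predict(X[~train])

# Per-system mean scores induce a ranking for teacher and surrogate alike.
ids_test = system_ids[~train]
teacher = [y[~train][ids_test == s].mean() for s in range(n_systems)]
surrogate = [y_pred[ids_test == s].mean() for s in range(n_systems)]
tau, _ = kendalltau(teacher, surrogate)
print(f"Kendall Tau between surrogate and teacher rankings: {tau:.3f}")
```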