@inproceedings{chung-freienthal-2026-cross,
  title     = {Cross-Lingual Stability of {LLM} Judges Under Controlled Generation: Evidence from {Finno-Ugric} Languages},
  author    = {Chung, Isaac and
               Freienthal, Linda},
  editor    = {Chen, Pinzhen and
               Zouhar, Vil{\'e}m and
               Hu, Hanxu and
               Khanuja, Simran and
               Zhu, Wenhao and
               Haddow, Barry and
               Birch, Alexandra and
               Aji, Alham Fikri and
               Sennrich, Rico and
               Hooker, Sara},
  booktitle = {Proceedings of the First Workshop on Multilingual Multicultural Evaluation},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/manual-author-scripts/2026.mme-main.8/},
  pages     = {133--148},
  isbn      = {979-8-89176-368-5},
  abstract  = {Cross-lingual evaluation of large language models (LLMs) typically conflates two sources of variance: genuine model performance differences and measurement instability. We investigate evaluation reliability by holding generation conditions constant while varying target language. Using synthetic customer-support dialogues generated with identical parameters across Estonian, Finnish, and Hungarian, we test whether automatic metrics and LLM-as-a-judge scoring produce stable model rankings across these morphologically rich, related Finno-Ugric languages. With a small set of Estonian native speaker annotations as a reference point, we find systematic ranking instabilities: surface-level metrics (lexical diversity, surface and semantic similarity) maintain cross-language stability, but pragmatic judgments (coherence, instruction-following) exhibit rank inversions and near-zero correlations. Because generation is controlled, these inconsistencies reflect how judge scoring behaves differently across languages rather than true model differences. This controlled design provides a diagnostic probe: evaluation methods that fail to maintain stability under identical generation conditions signal transfer failure before deployment. Our findings suggest that zero-shot judge transfer is unreliable for discourse-level assessment in morphologically rich languages, motivating language-specific calibration against targeted human baselines. We release our controlled generation protocol, synthetic data, and evaluation framework to enable replication across language families at \url{https://github.com/isaac-chung/cross-lingual-stability-judges}.},
}
Markdown (Informal)
[Cross-Lingual Stability of LLM Judges Under Controlled Generation: Evidence from Finno-Ugric Languages](https://preview.aclanthology.org/manual-author-scripts/2026.mme-main.8/) (Chung & Freienthal, MME 2026)
ACL