@comment{Converted from @article to @inproceedings: this is an LREC 2026
  conference paper (proceedings editors, pages, venue address), so the
  conference name belongs in booktitle, not journal. Dropped the Anthology
  volume slug "main" (not a bibliographic volume). Fixed publisher typo
  ("Resource" -> "Resources").}
@inproceedings{feraru-etal-2026-referenceless,
  title     = {Referenceless Evaluation of Machine Translation Models by Ranking Performance in {Romanian} to {English} Translate-train Settings},
  author    = {Feraru, Mihail and
               Diaconu, Alexandra and
               Alexe, Bogdan Dumitru},
  editor    = {Piperidis, Stelios and
               Bel, N{\'u}ria and
               van den Heuvel, Henk and
               Ide, Nancy and
               Krek, Simon and
               Toral, Antonio},
  booktitle = {International Conference on Language Resources and Evaluation},
  month     = may,
  year      = {2026},
  address   = {Palma de Mallorca, Spain},
  publisher = {ELRA Language Resources Association},
  url       = {https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.693/},
  pages     = {8813--8823},
  abstract  = {We propose a referenceless evaluation method for machine translation (MT) models by assessing their performance in translate-train scenarios across a variety of natural language processing (NLP) tasks. The approach ranks MT systems based on the downstream impact of their translations on independent NLP models trained on translated data, thus eliminating the need for professional ground-truth references. We evaluate four prominent MT tools {---} ChatGPT{~}3.5{~}Turbo, DeepL, Google{~}Translate, and Mistral{~}7B{~}Instruct{~}v0.2 {---} on the Romanian$\rightarrow$English language pair and analyze their influence on text summarization, sentiment analysis, and authorship identification. To further test the generalization and robustness of our method, we extend the evaluation to a cross-modality setup using out-of-domain speech data. In this setting, speech segments are transcribed with Whisper-Large, translated into English, and used in a four-class domain classification task (children{'}s stories, audiobooks, film dialogues, podcasts). Our findings show that translation improves downstream performance for sentiment analysis and summarization, while stylistically rich texts such as poetry or noisy ASR transcriptions suffer degradation. The proposed ranking metric correlates strongly with human judgments and remains sensitive to translation quality even in multimodal pipelines, providing a scalable and practical alternative to reference-based MT evaluation.}
}
@comment{Scraped page footer, preserved for reference: Markdown (Informal)}
@comment{Informal markdown citation from the scraped Anthology page:
  [Referenceless Evaluation of Machine Translation Models by Ranking Performance in Romanian to English Translate-train Settings](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.693/) (Feraru et al., LREC 2026)
  ACL}