% Workshop paper comparing user-satisfaction labels with LLM-as-a-judge
% scores for an e-commerce RAG assistant (see the abstract field).
% NOTE(review): the url field points at a preview.aclanthology.org staging
% branch ("manual-author-scripts"); confirm the permanent Anthology URL
% before relying on it.
% The title leaves "E-Commerce" unprotected so that sentence-casing styles
% can render it as "e-commerce", matching the spelling used in the abstract;
% only the acronyms LLM and RAG are brace-protected.
@inproceedings{turkmen-keles-2026-trust,
  title     = {Whom to Trust? Analyzing the Divergence Between User Satisfaction and {LLM}-as-a-Judge in E-Commerce {RAG} Systems},
  author    = {T{\"u}rkmen, Arif and
               Kele{\c{s}}, Kaan Efe},
  editor    = {Chen, Pinzhen and
               Zouhar, Vil{\'e}m and
               Hu, Hanxu and
               Khanuja, Simran and
               Zhu, Wenhao and
               Haddow, Barry and
               Birch, Alexandra and
               Aji, Alham Fikri and
               Sennrich, Rico and
               Hooker, Sara},
  booktitle = {Proceedings of the First Workshop on Multilingual Multicultural Evaluation},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/manual-author-scripts/2026.mme-main.12/},
  pages     = {189--195},
  isbn      = {979-8-89176-368-5},
  abstract  = {We study retrieval-augmented generation (RAG) evaluation in the Trendyol QA Assistant using 150k real e-commerce interactions. Our framework combines user satisfaction labels, LLM-as-a-judge scoring, and factor-based diagnostics to separate retrieval from generation errors. We find that judge models broadly reflect user satisfaction trends, though important nuances of dissatisfaction are often missed. Factor-level analysis highlights systematic error patterns across query types and context quality, demonstrating that hybrid evaluation, combining multiple LLM judges with direct user feedback offers the most reliable assessment strategy for production RAG systems.},
}

Markdown (Informal)
[Whom to Trust? Analyzing the Divergence Between User Satisfaction and LLM-as-a-Judge in E-Commerce RAG Systems](https://preview.aclanthology.org/manual-author-scripts/2026.mme-main.12/) (Türkmen & Keleş, MME 2026)
ACL