% Conference paper in the Proceedings of the 1st Symposium on Natural Language
% Generation Evaluations (Association for Computational Linguistics, 2026).
@inproceedings{braun-2026-solving,
  author    = {Braun, Daniel},
  title     = {Solving the Task but Not the Problem: A Customer Support Case Study on Why Extrinsic Evaluation Matters},
  editor    = {Mahamood, Saad and
               Howcroft, David M. and
               van Deemter, Kees and
               Balloccu, Simone and
               Sivaprasad, Adarsa and
               Sundararajan, Barkavi and
               Bugar{\'i}n Diz, Alberto and
               Alonso-Moral, Jose Mar{\'i}a},
  booktitle = {Proceedings of the 1st Symposium on Natural Language Generation Evaluations},
  publisher = {Association for Computational Linguistics},
  address   = {Aberdeen, United Kingdom},
  month     = jun,
  year      = {2026},
  pages     = {53--62},
  isbn      = {979-8-89176-436-1},
  url       = {https://preview.aclanthology.org/ingest-retroeval/2026.retroeval-main.7/},
  abstract  = {Natural Language Processing has long been used in customer support to automate and augment human agents. Despite its long-standing use and clear practical relevance, most scientific evaluations rely on intrinsic evaluations and metrics such as accuracy or F1-score. In this paper, we argue that such evaluations often fail to reflect real-world system impact. We present a case study of an NLP system for email-based customer support evaluated both intrinsically and extrinsically via a before-and-after study in deployment. While the system achieves strong intrinsic performance, we observe no measurable improvement in key operational metrics such as average handle time per email. These results highlight a mismatch between benchmark performance and real-world effectiveness, supporting calls for more systematic extrinsic evaluation of NLP systems.},
}
Markdown (Informal)
[Solving the Task but Not the Problem: A Customer Support Case Study on Why Extrinsic Evaluation Matters](https://preview.aclanthology.org/ingest-retroeval/2026.retroeval-main.7/) (Braun, RetroEval 2026)
ACL