@inproceedings{xu-etal-2026-debateqa,
title = "{D}ebate{QA}: Evaluating Question Answering on Debatable Knowledge",
author = "Xu, Rongwu and
Qi, Xuan and
Qi, Zehan and
Xu, Wei and
Guo, Zhijiang",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.44/",
pages = "854--885",
ISBN = "979-8-89176-386-9",
abstract = "The rise of large language models (LLMs) has enabled us to seek answers to inherently debatable questions on LLM chatbots, necessitating a reliable way to evaluate their ability. However, traditional QA benchmarks, which assume fixed answers, are inadequate for this purpose. To address this, we introduce DebateQA, a dataset of 2,941 debatable questions, each accompanied by multiple human-annotated partial answers that capture a variety of perspectives. We develop two metrics: Perspective Diversity, which evaluates the comprehensiveness of perspectives, and Dispute Awareness, which assesses if the LLM acknowledges the question{'}s debatable nature. Experiments demonstrate that both metrics align with human preferences and are stable across different underlying models. Using DebateQA with these two metrics, we assess 12 prevalent LLMs and retrieval-augmented generation methods. Our findings reveal that while LLMs generally excel at recognizing debatable issues, their ability to provide comprehensive answers encompassing diverse perspectives varies considerably."
}