% ACL Anthology record: Chen, Yu, and Flek, ArgMining 2026 workshop paper (pp. 19--31).
% NOTE(review): url was switched from the preview.aclanthology.org ingest-branch
% link to the canonical aclanthology.org form for the same Anthology ID
% (2026.argmining-1.3) -- confirm it resolves once the volume is published.
% NOTE(review): address holds the conference venue (ACL Anthology export
% convention), not the publisher's city.
@inproceedings{chen-etal-2026-three,
  title     = {A Three-Level Audit of {LLM} Alignment for Argument Quality Assessment},
  author    = {Chen, Wei-Fan and
               Yu, Jinming and
               Flek, Lucie},
  editor    = {Elaraby, Mohamed and
               Hautli-Janisz, Annette and
               Romberg, Julia and
               Musi, Elena and
               Ruggeri, Federico and
               Lawrence, John},
  booktitle = {Proceedings of the 13th Workshop on Argument Mining and Reasoning},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.argmining-1.3/},
  pages     = {19--31},
  isbn      = {979-8-89176-399-9},
  abstract  = {Large Language Models (LLMs) are increasingly used as automated evaluators of argument quality. However, existing studies typically assess models only through their agreement with human scores, leaving the reasoning process behind these judgments unexplored. In this paper, we propose a three-level audit framework for evaluating the reliability of LLM-based argument quality assessment. The framework distinguishes between (1) surface alignment, measuring agreement between LLM-predicted scores and human annotations; (2) instructional alignment, assessing whether generated rationales adhere to the intended evaluation criteria; and (3) faithfulness alignment, examining whether predicted scores are supported by the generated rationales. To operationalize this audit, we introduce structural rationale prompting, which guides LLMs to generate structured justifications before assigning scores across 11 dimensions of the Dagstuhl-15512 argument quality corpus. We evaluate several LLMs under this framework and find that structural rationale prompting substantially improves agreement with human annotations compared to definition-based prompting. Furthermore, the generated rationales generally follow the evaluation instructions and remain highly consistent with the predicted scores. Overall, our results suggest that auditing LLM evaluators beyond surface score agreement provides deeper insight into the reliability and transparency of LLM-based evaluation.},
}
Markdown (Informal)
[A Three-Level Audit of LLM Alignment for Argument Quality Assessment](https://aclanthology.org/2026.argmining-1.3/) (Chen et al., ArgMining 2026)
ACL