% Conference paper: ArgGenBench, in the ACL 2026 proceedings (Volume 1: Long Papers).
% Case-sensitive words in title/booktitle are protected with whole-word braces.
% NOTE(review): the url below points at a "preview ... ingest-acl" host, which
% looks like a staging address -- confirm and switch to the canonical
% Anthology URL (and add a doi) once the volume is published.
@inproceedings{jin-etal-2026-arggenbench,
  title     = {{ArgGenBench}: Benchmarking the Complex Controlled Argument Generation Capability of Large Language Models},
  author    = {Jin, Bojun and
               Bao, Jianzhu and
               Sun, Yang and
               Zhang, Yice and
               Xu, Ruifeng},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1414/},
  pages     = {30630--30662},
  isbn      = {979-8-89176-390-6},
  abstract  = {Argument generation is a fundamental NLP task that aims to automatically produce persuasive arguments. Effective human argumentation is inherently complex and multifaceted, integrating argumentative strategies, appropriate styles, and adaptation to target audiences, etc. However, existing studies focus on limited control signals such as topic, stance, or key aspects, failing to capture this complexity. As LLMs advance, the lack of benchmarks evaluating multifaceted argumentative control becomes a critical bottleneck. To address this, we introduce ArgGenBench, a novel benchmark containing complex instructions that integrate multi-dimensional control, including topic, stance, length, style, strategy, audience, and key points. Extensive evaluation across 15 LLMs reveals significant limitations: even the best-performing model achieves only 42.7{\%} win rate against human-verified references. These results highlight the challenge of controlled argument generation and establish ArgGenBench as a rigorous testbed for developing more capable systems.},
}