% Conference paper entry (CoNLL 2026 proceedings, published by ACL).
% NOTE(review): the url field points at a preview.aclanthology.org ingest
%   host; confirm the canonical Anthology URL (and DOI, if one exists)
%   before this entry is used in a camera-ready bibliography.
% NOTE(review): address carries the conference venue as exported; classic
%   styles print it as the publisher location -- confirm this is intended.
@inproceedings{ding-2026-modeling,
  title     = {Modeling Human Adversarial Strategy Adaptation in Multi-Turn Language Model Interactions},
  author    = {Ding, Zijun},
  editor    = {Bonial, Claire and Berzak, Yevgeni},
  booktitle = {Proceedings of the 30th Conference on Computational Natural Language Learning},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.22/},
  pages     = {382--394},
  isbn      = {979-8-89176-410-1},
  abstract  = {Adversarial red teaming is a central component of large language model (LLM) safety evaluation. While prior work has cataloged attack types and measured aggregate failure rates, less attention has been paid to the structured decision-making behavior of human attackers in multi-turn interaction. In this work, we model adversarial dialogue as a hierarchical and sequential process. We introduce a structured representation that decomposes red teaming conversations into goals, strategies, and tactics, where strategies capture distinct vulnerability dimensions and tactics operationalize these strategies at the linguistic level. Using 38,961 multi-turn conversations from a large-scale red teaming dataset, we analyze both first-turn strategy effects and multi-turn adaptation dynamics. Causal estimation reveals systematic differences in success rates across strategic categories. Predictive modeling further shows that incorporating structured strategy, tactic, and adaptation features improves AUC from 0.719 to 0.746 over a baseline without structure. Our findings suggest that adversarial effectiveness is not uniform but varies across structured vulnerability dimensions, and that modeling red teaming as sequential strategic interaction provides measurable explanatory and predictive gains.},
}
Markdown (Informal)
[Modeling Human Adversarial Strategy Adaptation in Multi-Turn Language Model Interactions](https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.22/) (Ding, CoNLL 2026)
ACL