% Workshop paper by Jihye Kim in the TrustNLP 2026 proceedings (ACL);
% cite as kim-2026-coercion.
% The math-mode K in the title is brace-protected so that bibliography styles
% which sentence-case titles cannot lowercase it.
% NOTE(review): the url points at a preview.aclanthology.org ingest path, which
% looks like a staging link -- confirm the canonical URL before relying on it.
% NOTE(review): the editor surname "Galystan" may be a misspelling of
% "Galstyan" -- verify against the proceedings front matter before changing.
@inproceedings{kim-2026-coercion,
  title     = {Coercion Suppression Increases Preference Hallucinations via a Deceptive Bypass in {$K$}-Level Negotiation Agents},
  author    = {Kim, Jihye},
  editor    = {Chang, Kai-Wei and
               Mehrabi, Ninareh and
               Krishna, Satyapriya and
               Das, Anubrata and
               Dhamala, Jwala and
               Cao, Yang Trista and
               Kumarage, Tharindu and
               Ramakrishna, Anil and
               Christodoulopoulos, Christos and
               Wan, Yixin and
               Galystan, Aram and
               Kumar, Anoop and
               Gupta, Rahul},
  booktitle = {Proceedings of the 6th Workshop on Trustworthy {NLP} ({T}rust{NLP} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.17/},
  pages     = {287--294},
  isbn      = {979-8-89176-418-7},
  abstract  = {K-Level reasoning{---}recursive modeling of opponent beliefs{---}improves LLM negotiation utility but frequently elicits coercive and toxic behaviors that undermine real-world deployability. We propose an Observer{--}Planner{--}Actor architecture with a Modular Appraisal Gate that (i) dynamically estimates the opponent{'}s cognitive level and (ii) filters hostile drafts via an LLM-as-a-judge. In randomized interventions on the CaSiNo dataset, our gated agent eliminates toxicity (0{\%}) and reduces coercion from 35{\%} to 6{\%} compared to a strong static-K baseline, albeit with an alignment tax in utility. However, the gate does not reduce preference hallucinations{---}strategic misrepresentation of the agent{'}s own priorities. K-Level reasoning incidentally suppresses this behavior (from 35{\%} in a vanilla baseline to 22{\%}), but gating coercion releases the suppression, returning hallucination to vanilla-baseline levels (33{--}37{\%}). We term this pattern a deceptive bypass: output-level filters address the form of hostility but leave surface-compliant manipulation channels intact, demonstrating that they alone are insufficient to align utility-driven strategic agents.},
}
Markdown (Informal)
[Coercion Suppression Increases Preference Hallucinations via a Deceptive Bypass in K-Level Negotiation Agents](https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.17/) (Kim, TrustNLP 2026)
ACL