% Kim and Cho, Findings of ACL 2025 (ACL Anthology ID 2025.findings-acl.160).
% The url field uses the canonical Anthology address instead of a temporary
% preview-branch link. A stray commented-out LaTeX fragment that had leaked
% into the abstract text has been removed.
@inproceedings{kim-cho-2025-goodliar,
  author    = {Kim, Soo Kyung and
               Cho, Hyunsoo},
  title     = {{GOODLIAR}: A Reinforcement Learning-Based Deceptive Agent for Disrupting {LLM} Beliefs on Foundational Principles},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.160/},
  pages     = {3076--3101},
  isbn      = {979-8-89176-256-5},
  abstract  = {Large Language Models (LLMs) often succumb to adversarial prompts, a phenomenon popularly known as ``jailbreaking.'' While jailbreaking primarily targets short-term noncompliance with predefined policies, we argue that a deeper vulnerability lies in altering an LLM{'}s \textit{fundamental axiomatic beliefs}, such as mathematical or philosophical truths. In this work, we introduce GoodLiar, a reinforcement learning (RL)-based framework that generates deceptive contexts to systematically \textit{rewrite} an LLM{'}s core logical or philosophical understandings. By incentivizing an RL agent to produce persuasive and coherent arguments, GoodLiar aims to induce \textit{persistent} belief shifts, rather than merely influencing immediate judgments of factual truthfulness. Our approach introduces \textit{DA-ILQL}, a novel offline RL method that extends ILQL by integrating on-policy data and language exploration to enhance the language discovery and optimization. Through extensive evaluations on multiple LLMs, we show that deceptive contexts discovered by GoodLiar consistently outperform simple multi-turn prompting methods.},
}
Markdown (Informal)
[GOODLIAR: A Reinforcement Learning-Based Deceptive Agent for Disrupting LLM Beliefs on Foundational Principles](https://aclanthology.org/2025.findings-acl.160/) (Kim & Cho, Findings 2025)
ACL