@inproceedings{cheng-etal-2025-weaponization,
title = "On Weaponization-Resistant Large Language Models with Prospect Theoretic Alignment",
author = "Cheng, Zehua and
Zhang, Manying and
Sun, Jiahao and
Dai, Wei",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.coling-main.687/",
pages = "10309--10324",
abstract = "Large language models (LLMs) have made significant advancements, but their increasing capabilities present serious risks of misuse, particularly in open-weight models where direct access to the model{'}s parameters is possible. Current safeguards, designed for closed-weight API models, are inadequate for open-weight models, as minimal fine-tuning can bypass these protections. Preserving the integrity of open-weight LLMs before deployment has thus become a critical challenge. We argue that these vulnerabilities stem from the overemphasis on maximizing the LLM{'}s log-likelihood during training, which amplifies data biases, especially with large datasets. To address these issues, we introduce Kahneman and Tversky{'}s Prospect Theoretic Integrity Preserving Alignment (KT-IPA), a framework that prioritizes maximizing generative utility rather than a singular optimization metric. This approach strengthens LLMs against misuse and weaponization while maintaining high performance, even after extensive fine-tuning. Our results demonstrate that integrating prospect theory into LLM training enhances robustness, security, and responsible innovation in this rapidly evolving field. Our codes are available on https://anonymous.4open.science/r/KT-IPA-40B7"
}
Markdown (Informal)
[On Weaponization-Resistant Large Language Models with Prospect Theoretic Alignment](https://preview.aclanthology.org/fix-sig-urls/2025.coling-main.687/) (Cheng et al., COLING 2025)
ACL