@inproceedings{ko-etal-2025-building,
title = "Building Helpful-Only Large Language Models: A Complete Approach from Motivation to Evaluation",
author = "Ko, Donghyeon and
Yang, Sohee and
Kwak, Donghyun and
Lee, Sang-Woo",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.7/",
pages = "116--131",
ISBN = "979-8-89176-303-6",
abstract = "Reinforcement learning from AI feedback (RLAIF) is widely used for customizing the safety policies of large language models (LLMs) at scale. However, standard aligned LLMs are poorly suited to this setting, as their fixed alignment prevents adaptation to new policies. To address this, prior works have employed $\textbf{Helpful-Only LLMs (HOLLMs)}$. Despite their effectiveness, no public framework exists for training or evaluating HOLLMs. In this paper, we present a comprehensive framework for developing HOLLMs that enable custom safety alignment. We first define the key attributes of a HOLLM and then propose $\textbf{Refusal-Avoidant Instruction Learning (RAIL)}$, a novel training method that constructs HOLLMs from open-source datasets. We also introduce a comprehensive evaluation framework including a new benchmark: $\textbf{Helpfulness Evaluation without Limitations from Policies (HELP)}$. Experiments show that the HOLLM achieves a 30.28{\%} reduction in refusal rate over the strongest refusal-optimized baseline without compromising general capabilities. The HOLLM also achieves a 29.25{\%} higher accuracy on HELP compared to the best-performing baseline. These results demonstrate that RAIL effectively cultivates the key attributes required of a HOLLM."
}