@inproceedings{vassoyan-etal-2025-ignore,
title = "Ignore the {KL} Penalty! Boosting Exploration on Critical Tokens to Enhance {RL} Fine-Tuning",
author = {Vassoyan, Jean and
Beau, Nathana{\"e}l and
Plaud, Roman},
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.findings-naacl.340/",
pages = "6108--6118",
ISBN = "979-8-89176-195-7",
abstract = "The ability to achieve long-term goals is a key challenge in the current development of large language models (LLMs). To address this, pre-trained LLMs can be fine-tuned with reinforcement learning (RL) to explore solutions that optimize a given goal. However, exploration with LLMs is difficult, as a balance has to be struck between discovering new solutions and staying close enough to the pre-trained model, so as not to degrade basic capabilities. This is typically controlled with a Kullback-Leibler (KL) penalty. In this paper, we investigate the exploration dynamics of a small language model on a simple arithmetic task. We show how varying degrees of pre-training influence exploration and demonstrate the importance of {\textquotedblleft}critical tokens{\textquotedblright} which have a dramatic impact on the final outcome. Consequently, we introduce a simple modification to the KL penalty that favors exploration on critical tokens, increasing the efficiency of the RL fine-tuning stage."
}
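The abstract describes a modification of the per-token KL penalty so that exploration is encouraged on "critical tokens". Below is a minimal, hypothetical sketch of that idea: a token-wise KL(policy || reference) penalty whose coefficient is dropped to zero on tokens flagged as critical. The function name, the `critical_mask` input, and the choice to zero (rather than merely reduce) the coefficient are illustrative assumptions, not the authors' released implementation.

```python
# Illustrative sketch (not the paper's code): per-token KL penalty that is
# "ignored" on critical tokens and applied with coefficient beta elsewhere.
import torch
import torch.nn.functional as F

def per_token_kl_penalty(policy_logits, ref_logits, critical_mask, beta=0.1):
    """Token-wise KL(policy || reference) penalty.

    policy_logits, ref_logits: (batch, seq_len, vocab) logits from the
        fine-tuned policy and the frozen pre-trained reference model.
    critical_mask: (batch, seq_len) boolean tensor, True where a token is
        deemed "critical" (assumption: some heuristic marks such positions).
    beta: KL coefficient applied to non-critical tokens.
    """
    log_p = F.log_softmax(policy_logits, dim=-1)
    log_q = F.log_softmax(ref_logits, dim=-1)

    # KL(p || q) summed over the vocabulary, one value per token position.
    kl_per_token = (log_p.exp() * (log_p - log_q)).sum(dim=-1)

    # Zero the coefficient on critical tokens, keep beta elsewhere.
    coeff = torch.where(critical_mask,
                        torch.zeros_like(kl_per_token),
                        torch.full_like(kl_per_token, beta))
    return (coeff * kl_per_token).sum(dim=-1)  # penalty per sequence


if __name__ == "__main__":
    # Toy usage with random logits and an arbitrary critical position.
    torch.manual_seed(0)
    policy = torch.randn(2, 5, 16)
    ref = torch.randn(2, 5, 16)
    critical = torch.zeros(2, 5, dtype=torch.bool)
    critical[:, 2] = True  # pretend position 2 is a critical token
    print(per_token_kl_penalty(policy, ref, critical))
```

The resulting penalty would be subtracted from the RL reward as usual; the only change relative to a standard KL-regularized objective is the position-dependent coefficient.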