@inproceedings{zhou-etal-2025-reg,
title = "{T}-{REG}: Preference Optimization with Token-Level Reward Regularization",
author = "Zhou, Wenxuan and
Zhang, Shujian and
Zhao, Lingxiao and
Meng, Tao",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1353/",
pages = "27876--27889",
ISBN = "979-8-89176-251-0",
abstract = "Reinforcement Learning from Human Feedback (RLHF) has been pivotal in enabling Large Language Models (LLMs) to effectively follow instructions and produce meaningful alignment by leveraging human preference data. Traditionally, RLHF involves generating responses to a query and using a separate reward model to assign a score to the entire completion. This approach, however, presents challenges, as it provides a single, sparse reward at the end of a sequence, making optimization difficult for the model, in which both training and generation occur auto-regressively at token levels. While recent methods have attempted to address this by assigning token-level discrete or continuous rewards, these often rely on either a trained credit assignment model or AI annotators, which raises concerns about the quality and reliability of the token-level rewards. In this paper, we propose T-REG, which utilizes both sequence-level and token-level rewards for preference optimization. T-REG employs self-generated token-level rewards, derived through opposite prompting, as a weak supervision signal to guide the model in distributing sequence-level rewards at the token level, thereby achieving more effective token-level credit assignment and improving alignment performance. Experiments on the instruction following benchmarks, including Alpaca Eval 2 and Arena-Hard, show that our method consistently outperforms baseline methods by up to 3.8{\%} and 4.4{\%}, respectively."
}
Markdown (Informal)
[T-REG: Preference Optimization with Token-Level Reward Regularization](https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1353/) (Zhou et al., ACL 2025)
ACL
Wenxuan Zhou, Shujian Zhang, Lingxiao Zhao, and Tao Meng. 2025. [T-REG: Preference Optimization with Token-Level Reward Regularization](https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1353/). In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 27876–27889, Vienna, Austria. Association for Computational Linguistics.
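
As a rough aid to reading the abstract, the sketch below shows one way a sequence-level DPO-style preference loss can be combined with a token-level regularizer driven by precomputed token rewards. It is a minimal illustration under assumptions, not the authors' implementation: the function name, the weighted-margin form of the regularizer, and the coefficients `beta` and `treg_coef` are hypothetical, and the self-generated token-level rewards from opposite prompting are simply taken as given inputs.

```python
# Minimal sketch (not the authors' code): a DPO-style sequence-level loss
# plus an illustrative token-level regularization term. The self-generated
# token rewards are assumed to be precomputed and passed in as
# `token_weights_*`; `treg_coef` and the weighted-margin regularizer are
# assumptions for illustration only.
import torch
import torch.nn.functional as F


def dpo_with_token_level_reg(
    policy_logps_chosen, policy_logps_rejected,    # (batch, seq) per-token log-probs, policy
    ref_logps_chosen, ref_logps_rejected,          # (batch, seq) per-token log-probs, reference
    token_weights_chosen, token_weights_rejected,  # (batch, seq) self-generated token rewards
    mask_chosen, mask_rejected,                    # (batch, seq) 1 on response tokens, 0 elsewhere
    beta=0.1, treg_coef=0.1,
):
    # Implicit per-token rewards under the DPO parameterization:
    # r_t = beta * (log pi_theta(y_t | .) - log pi_ref(y_t | .)).
    tok_r_chosen = beta * (policy_logps_chosen - ref_logps_chosen) * mask_chosen
    tok_r_rejected = beta * (policy_logps_rejected - ref_logps_rejected) * mask_rejected

    # Sequence-level preference loss (standard DPO) over summed token rewards.
    seq_margin = tok_r_chosen.sum(-1) - tok_r_rejected.sum(-1)
    dpo_loss = -F.logsigmoid(seq_margin).mean()

    # Token-level regularizer: encourage the implicit token rewards to agree
    # with the weak token-level supervision (higher implicit reward on tokens
    # the self-generated rewards favor). This weighted-margin form is one
    # illustrative choice, not the paper's exact formulation.
    weighted_margin = (
        (token_weights_chosen * tok_r_chosen).sum(-1)
        - (token_weights_rejected * tok_r_rejected).sum(-1)
    )
    treg_loss = -F.logsigmoid(weighted_margin).mean()

    return dpo_loss + treg_coef * treg_loss
```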