@comment{NOTE(review): the url field below points at a preview/ingest host; confirm it against the canonical published record before citing.}
@inproceedings{li-etal-2026-reward,
  title     = {Reward Alignment Optimization: A Direct Point-wise Alignment Approach},
  author    = {Li, Zelin and
               Leng, Jia and
               Song, Dawei and
               Hu, Yangen},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.2027/},
  pages     = {43770--43784},
  isbn      = {979-8-89176-390-6},
  abstract  = {Direct Alignment Algorithms (DAAs) such as DPO simplify RLHF by optimizing policies directly from preference pairs. However, the Bradley{--}Terry probability-gap objective can induce likelihood displacement and, under weak KL constraints, may even reduce the probability of preferred responses, while implicit rewards can be limited in generalization. We propose Reward Alignment Optimization (RAO), a point-wise direct alignment method that uses an explicit reward model to specify exact target generation probabilities and align the policy offline towards them. Our key insight is a theoretical principle we call ``prefix consistency'', which links the normalization terms of prompts that share a prefix. Leveraging this property, RAO decouples target reward differentials from bias terms, prevents decreasing preferred-response probabilities, and better exploits reward information both within and across prompts. Extensive experiments on multiple base LLMs show that RAO consistently outperforms existing DAAs while enabling controllable target probability distributions.},
}
Markdown (Informal)
[Reward Alignment Optimization: A Direct Point-wise Alignment Approach](https://preview.aclanthology.org/ingest-acl/2026.acl-long.2027/) (Li et al., ACL 2026)
ACL