@inproceedings{xu-etal-2025-mwpo,
title = "{MWPO}: Enhancing {LLM}s Performance through Multi-Weight Preference Strength and Length Optimization",
author = "Xu, Shiyue and
Zhang, Fu and
Cheng, Jingwei and
Zhou, Linfeng",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.findings-acl.1057/",
pages = "20566--20581",
ISBN = "979-8-89176-256-5",
abstract = "Direct Preference Optimization (DPO) have proposed offline alternatives to Reinforcement Learning from Human Feedback (RLHF). In DPO, each preference pair, which serves as the foundation for learning, is typically constructed by first generating multiple responses to the same instruction and then annotating them to indicate the preferred choice. However, when the responses are highly similar, the weak preference signal can introduce annotation noise, which may hinder model optimization. Additionally, DPO suffers from the drawback of over-optimizing for verbose generation. A potential reason is the presence of length bias in preference datasets, which can lead to length exploitation. To address these issues, we propose a DPO-based **m**ulti-**w**eight **p**reference strength and length **o**ptimization (MWPO) method. Specifically, we propose to reweight preference pairs based on implicit reward margins and response length margins, unifying them through a geometric mixture to generate synthetic weights for optimization. This method allows preference pairs with stronger preference signals or more favorable length feature to have a more pronounced impact on model parameters. Moreover, our method does not require additional annotators. We validate our method on models of four different scales across multiple benchmarks. Our method surpasses state-of-the-art (SOTA) baselines, outperforming DPO by up to 8.7{\%} on AlpacaEval 2 while reducing generation length by 9.4{\%} in the Mistral setting. Our code is available at https://github.com/AIR-hl/MWPO."
}
Markdown (Informal)
[MWPO: Enhancing LLMs Performance through Multi-Weight Preference Strength and Length Optimization](https://preview.aclanthology.org/landing_page/2025.findings-acl.1057/) (Xu et al., Findings 2025)
ACL
Shiyue Xu, Fu Zhang, Jingwei Cheng, and Linfeng Zhou. 2025. [MWPO: Enhancing LLMs Performance through Multi-Weight Preference Strength and Length Optimization](https://preview.aclanthology.org/landing_page/2025.findings-acl.1057/). In *Findings of the Association for Computational Linguistics: ACL 2025*, pages 20566–20581, Vienna, Austria. Association for Computational Linguistics.
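
The abstract sketches the core idea: reweight each DPO preference pair by a geometric mixture of its implicit reward margin and its response-length margin. Below is a minimal, hypothetical PyTorch sketch of such a weighted DPO loss; it is not the authors' implementation (see https://github.com/AIR-hl/MWPO for the official code), and the sigmoid weight mappings, the `alpha` mixing coefficient, and all function and argument names are illustrative assumptions.

```python
# Hypothetical MWPO-style weighting sketch (not the authors' code).
import torch
import torch.nn.functional as F

def mwpo_loss(policy_chosen_logps, policy_rejected_logps,
              ref_chosen_logps, ref_rejected_logps,
              chosen_lengths, rejected_lengths,
              beta=0.1, alpha=0.5):
    """DPO log-sigmoid loss where each pair is scaled by a geometric mixture
    of a preference-strength weight and a response-length weight."""
    # Implicit reward margin from the DPO parameterization.
    reward_margin = beta * ((policy_chosen_logps - ref_chosen_logps)
                            - (policy_rejected_logps - ref_rejected_logps))
    # Length margin: positive when the chosen response is shorter.
    length_margin = (rejected_lengths - chosen_lengths).float()

    # Map both margins into (0, 1); larger margin -> larger weight.
    # (The exact mappings are assumptions, chosen only for illustration.)
    w_strength = torch.sigmoid(reward_margin).detach()
    w_length = torch.sigmoid(
        length_margin / rejected_lengths.clamp(min=1).float()
    ).detach()

    # Geometric mixture of the two weights; alpha balances the two signals.
    weights = w_strength.pow(alpha) * w_length.pow(1.0 - alpha)

    # Standard DPO objective, reweighted per preference pair.
    losses = -weights * F.logsigmoid(reward_margin)
    return losses.mean()
```

In this sketch, `alpha=1` would reduce the weighting to the preference-strength signal alone, while `alpha=0` would weight pairs by the length signal alone; intermediate values blend the two, in the spirit of the geometric mixture described in the abstract.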