@comment{Conference paper entry (Findings of ACL 2026); the url field points at an aclanthology.org preview/ingest address.}
@inproceedings{xu-etal-2026-understanding,
    title = "Understanding Conflicts in Multi-Objective Alignment through Reward Consistency",
    author = "Xu, Zhihao and
      Tong, Yongqi and
      Zhang, Xin and
      Zhou, Jun and
      Wang, Xiting",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl/2026.findings-acl.269/",
    pages = "5450--5472",
    isbn = "979-8-89176-395-1",
    abstract = "Multi-objective preference alignment often faces alignment conflicts, where optimizing for one objective (e.g., helpfulness) degrades performance on others (e.g., harmlessness). While prior work focuses on algorithmic solutions, the intrinsic conflict within data and its theoretical impact on training remain underexplored. To bridge this gap, we introduce the principle of Reward Consistency (RC), a theory-grounded criterion that approximates the alignment conflicts via reward models. We prove that a sample mitigates conflicts if and only if it satisfies RC, thereby ensuring improvement across all objectives during optimization. Building on this, we propose Reward Consistency Sampling (RCS), an automated framework for constructing pairwise data that adheres to RC, supplemented by a relaxation strategy to enhance flexibility. Extensive experiments show that RCS brings significant and consistent performance gains, achieving an average improvement of 23.07{\%} in both harmlessness and helpfulness during simultaneous optimization compared to the vanilla dataset. Our data-centric approach is complementary to existing alignment algorithms and effective in both sequential and simultaneous optimization scenarios."
}
Markdown (Informal)
[Understanding Conflicts in Multi-Objective Alignment through Reward Consistency](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.269/) (Xu et al., Findings 2026)
ACL