% Conference paper record (ACL 2026, Volume 1: Long Papers), in ACL Anthology export layout.
% NOTE(review): the url field points at a preview.aclanthology.org "ingest-acl" build;
% confirm and switch to the canonical Anthology URL (and add a doi) once one is available.
@inproceedings{yin-shi-2026-individual,
  title     = {From Individual to Common: An Early Exploration of Consensus in Non-verifiable Data for Balanced Preference Optimization},
  author    = {Yin, Shangjian and
               Shi, Zhouxing},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1598/},
  pages     = {34612--34630},
  isbn      = {979-8-89176-390-6},
  abstract  = {Reinforcement Learning with Verifiable Rewards (RLVR) has demonstrated remarkable effectiveness in boosting the objective performance (e.g., reasoning) of Large Language Models (LLMs) through rule-based, on-policy self-improvement strategies. However, optimizing LLMs for subjective capabilities and alignment with human preferences remains challenging due to the non-verifiable nature. Most prior works use datasets comprising response pairs with substantial quality gaps labeled by a strong external judge. While effective for preference metrics, this paradigm often incurs an ``alignment tax'', where the model's objective performance on downstream tasks degrades as it overfits to subjective preferences. In this work, we introduce Donkey, a high-quality, non-verifiable dataset where response pairs differ only by subtle nuances. We find that LLMs optimized on Donkey via preference learning outperform those trained on data with explicit quality gaps, while simultaneously maintaining their objective capabilities. Furthermore, we observe that preference signals on Donkey can be decomposed into consensus preferences and individual preferences. Our analysis reveals that distilling consensus preferences provides a significantly more data-efficient signal for preference optimization. Our findings underscore the importance of leveraging nuanced preference signals and the consensus of multiple judges for advancing subjective LLM alignment. Our code and data will be available at https://github.com/SJY8460/Donkey.},
}
Markdown (Informal)
[From Individual to Common: An Early Exploration of Consensus in Non-verifiable Data for Balanced Preference Optimization](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1598/) (Yin & Shi, ACL 2026)
ACL