@comment{Findings of ACL 2026 paper, ACL Anthology export.
  NOTE(review): the url field targets the preview.aclanthology.org ingest site;
  confirm the canonical Anthology URL before citing.
  Case-sensitive words in title and booktitle are protected with whole-word braces.}
@inproceedings{reddy-etal-2026-biasgrpo,
  title     = {{BiasGRPO}: Stabilizing Bias Mitigation in High-Variance Reward Landscapes via Group-Relative Policy Optimization},
  author    = {Reddy, Saket and
               Yang, Ke and
               Zhai, ChengXiang},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.2052/},
  pages     = {41250--41267},
  isbn      = {979-8-89176-395-1},
  abstract  = {Mitigating social bias in Large Language Models (LLMs) presents a distinct alignment challenge: unlike verifiable tasks, social bias lacks a single ground truth, creating a high-variance, subjective reward landscape. Previous preference-based fine-tuning methods have major trade-offs: Direct Preference Optimization (DPO) is limited by the lack of exploration inherent in offline training, while Proximal Policy Optimization (PPO) can lead to training instability due to potentially unreliable critic estimates. In this paper, we propose BiasGRPO, an adaptation of Group Relative Policy Optimization (GRPO) that stabilizes alignment by normalizing rewards across a group of sampled completions. By substituting the value function with a group-relative baseline, our approach reduces instability while maintaining the exploration benefits of online reinforcement learning. To adapt GRPO, we curate and synthetically extend a dataset spanning multiple domains and contexts, and create a custom, bias-specific reward model for effectively guiding generation while avoiding knowledge degradation. We find that BiasGRPO outperforms DPO and PPO across multiple benchmarks, indicating its effectiveness as an alignment technique that can overcome the limitations of previous preference-based methods.},
}
Markdown (Informal)
[BiasGRPO: Stabilizing Bias Mitigation in High-Variance Reward Landscapes via Group-Relative Policy Optimization](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.2052/) (Reddy et al., Findings 2026)
ACL