% Findings of ACL 2026 paper by Sha and Zhu, as exported from the ACL Anthology.
% NOTE(review): the url field below points at a preview.aclanthology.org staging
% build (ingest-acl branch); presumably it should be replaced by the canonical
% Anthology URL once the volume is published -- confirm before citing.
% NOTE(review): address holds the conference venue, as in the Anthology export,
% rather than the publisher's city.
@inproceedings{sha-zhu-2026-verifying,
  title     = {Verifying the Subjective: Structured Multilingual Rewards for Low-Resource Alignment},
  author    = {Sha, Jiu and Zhu, Mengxiao},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1174/},
  pages     = {23442--23474},
  isbn      = {979-8-89176-395-1},
  abstract  = {Aligning LLMs in low-resource multilingual settings faces a fundamental reward bottleneck: scalar rewards lack cultural generalization, while unstructured critiques remain noisy and unverifiable. To bridge this gap, we introduce a Structured Multilingual Reward Modeling Framework that extends Reinforcement Learning with Verifiable Rewards (RLVR) to subjective and open-ended tasks. The framework unifies three core components to transform abstract quality into concrete supervision: (1) a Structured Checklist Schema decomposing evaluation into granular universal reasoning steps and task-specific criteria; (2) Structured Generative Critique Modeling, which produces rubric-aligned critiques with grounded justifications; and (3) Adaptive Multilingual Reward Optimization, integrating reasoning quality and language consistency into a verifiable objective. We integrate this framework into a bootstrapped Group Relative Policy Optimization pipeline, augmented by length-aware normalization and variance stabilization to ensure stability. Extensive experiments on a newly constructed suite covering 7 subjective task categories across 50 low-resource languages demonstrate that this checklist-driven approach yields substantial improvements in reasoning capability and response quality, particularly in settings where traditional reward models exhibit significant degradation. We publicly release our models and the corresponding evaluation benchmark to facilitate further research. Our code is available at \url{https://github.com/Shajiu/SGCM}.},
}
Markdown (Informal)
[Verifying the Subjective: Structured Multilingual Rewards for Low-Resource Alignment](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1174/) (Sha & Zhu, Findings 2026)
ACL