% Conference-paper entry. Words that must keep their capitalisation are braced
% as whole words, so styles that sentence-case titles leave them intact.
% NOTE(review): the url field points at a preview.aclanthology.org ingest path,
% presumably a staging link; confirm the canonical URL before publishing.
@inproceedings{lin-etal-2026-clarity,
  title     = {{CLARity}: Reasoning Consistency Alone Can Teach Reinforced Experts},
  author    = {Lin, Jiuheng and
               Jiang, Cong and
               Wu, Zirui and
               Sun, Jiarui and
               Feng, Yansong},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1358/},
  pages     = {29460--29480},
  isbn      = {979-8-89176-390-6},
  abstract  = {Training expert LLMs in domains with scarce fine-grained annotated data is admittedly challenging, often relying on multiple-choice questions (MCQs). However, standard outcome-based reinforcement learning (RL) on MCQs is risky. While outcome-based RL may improve accuracy, it frequently compromises the reasoning process, yielding internally inconsistent rationales that diverge from the final predictions. Existing solutions to supervise the reasoning process, such as large-scale Process Reward Models (PRMs), are prohibitively expensive. To address this, we propose CLARity, a cost-effective RL framework that enhances reasoning quality using a small, general-purpose LLM only. CLARity integrates a consistency-aware reward mechanism with a 2-stage refine-then-monitor training pipeline to enhance reasoning consistency, and a dynamic data reformulation strategy to better exploit annotated data available. Experiments demonstrate that CLARity can improve the consistency of responses by 16.5{\%} over standard outcome-based RL, and bring an improvement of 7.5{\%} in final accuracy. Human evaluations further confirm substantial gains in factual correctness and reasoning coherence, leading to more trustworthy model outputs. Thus, CLARity offers a generalizable solution that enables smaller models to effectively guide expert LLM training by monitoring reasoning consistency.},
}
Markdown (Informal)
[CLARity: Reasoning Consistency Alone Can Teach Reinforced Experts](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1358/) (Lin et al., ACL 2026)
ACL
- Jiuheng Lin, Cong Jiang, Zirui Wu, Jiarui Sun, and Yansong Feng. 2026. CLARity: Reasoning Consistency Alone Can Teach Reinforced Experts. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 29460–29480, San Diego, California, United States. Association for Computational Linguistics.