% Findings of EMNLP 2025 paper (ACL Anthology export); `url` is the canonical Anthology page, not a preview build.
@inproceedings{kim-kang-2025-exploration,
  title     = {Exploration-Driven Reinforcement Learning for Expert Routing Improvement in Mixture-of-Experts Language Models},
  author    = {Kim, Gyunyeop and Kang, Sangwoo},
  editor    = {Christodoulopoulos, Christos and Chakraborty, Tanmoy and Rose, Carolyn and Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.1282/},
  doi       = {10.18653/v1/2025.findings-emnlp.1282},
  pages     = {23592--23605},
  isbn      = {979-8-89176-335-7},
  abstract  = {The performance of MoE-based LLMs depends on the router{'}s ability to select suitable experts; however, the router is typically not explicitly supervised to acquire this routing ability. We propose Exploration-Driven Reinforcement Learning (ERL), which explicitly optimizes the router by exploration of alternative routing paths. For every input, ERL evaluates by (i) the original routing path and (ii) paths in which an $\alpha$-fraction of routing decisions is randomly perturbed, and treats their performance gap as an advantage signal in a reinforcement learning. Moreover, MoE-ERL$_{wPL}$ mitigates the risk of performance collapse caused by routing reinforcement learning{--}induced expert over-specialization by intentionally enforcing overlap in experts' knowledge. Without adding parameters or external reward models, our method improves summarization (SAMSum, XSUM), question answering (SQuAD), and language modeling (WikiText-2), and raises routing quality, delivering up to 8.9 {\texttimes} higher MRR than baselines over 100 perturbed routing paths. Code is available at our github.},
}
Markdown (Informal)
[Exploration-Driven Reinforcement Learning for Expert Routing Improvement in Mixture-of-Experts Language Models](https://aclanthology.org/2025.findings-emnlp.1282/) (Kim & Kang, Findings 2025)
ACL