% Workshop paper in the GEM 2026 proceedings (pages 77--83).
% NOTE(review): the url field points at a preview/ingest host
% (preview.aclanthology.org/ingest-acl-workshops); confirm the canonical
% Anthology URL once the volume is published.
% The abstract field is informational only; standard styles ignore it.
@inproceedings{tiwari-2026-component,
  title     = {Component Transfer Can Exceed Full Model Performance: Investigating Post-Trained Mixture-of-Experts},
  author    = {Tiwari, Rabin},
  editor    = {Mille, Simon and
               Gehrmann, Sebastian and
               Schmidtov{\'a}, Patr{\'i}cia and
               Du{\v{s}}ek, Ond{\v{r}}ej and
               Fadaee, Marzieh and
               Lo, Kyle and
               Santus, Enrico and
               Stanovsky, Gabriel},
  booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.7/},
  pages     = {77--83},
  isbn      = {979-8-89176-423-1},
  abstract  = {Post-training methods such as supervised fine-tuning and preference optimization are widely used to align large language models, yet how their benefits distribute across architectural components and transfer across tasks and prompts remains unclear. In this work, we analyze component-level transfer in a Mixture-of-Experts language model by selectively replacing routers, attention modules, and expert networks between two post-trained Mixture of Experts models trained with different post-training recipes and dataset mixtures. Starting from a SFT+DPO checkpoint, we systematically replace its components (routers, attention, experts) with those from a Tulu3 checkpoint and evaluate the impact of each replacement and their combinations on mathematical and scientific reasoning and a general-purpose classification task under zero-shot, few-shot and Chain of Thought prompting. We find strong component-specific specialization: expert networks account for most gains on mathematical and scientific reasoning, while attention mechanisms consistently outweigh expert transfer on general tasks and router transfer alone provides minimal benefit or harms performance. Prompting strategy further modulates these effects, with expert transfer degrading zero-shot science performance but improving few-shot reasoning. Strategically combining components from different model versions can in some cases match or exceed the performance of the best available model, motivating principled techniques for composing post-trained models into task- and prompt-specific systems without additional training.},
}
Markdown (Informal)
[Component Transfer Can Exceed Full Model Performance: Investigating Post-Trained Mixture-of-Experts](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.7/) (Tiwari, GEM 2026)
ACL