% Conference-paper record for the MixtureKit system-demonstration paper.
% Case-sensitive words are brace-protected as whole words so that styles
% applying sentence casing keep them intact without breaking kerning.
% NOTE(review): the url field points at a preview.aclanthology.org ingest
% branch, presumably a staging location; confirm the final URL before release.
@inproceedings{chamma-etal-2026-mixturekit,
  title     = {{MixtureKit}: A General Framework for Composing, Training, and Visualizing Mixture-of-Experts Models},
  author    = {Chamma, Ahmad and
               El Herraoui, Omar and
               Shang, Guokan},
  editor    = {Durrett, Greg and
               Jian, Ping},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 3: System Demonstrations)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-demo.15/},
  pages     = {148--156},
  isbn      = {979-8-89176-392-0},
  abstract  = {We introduce MixtureKit, a modular open-source framework for constructing, training, and analyzing Mixture-of-Experts (MoE) models from arbitrary pre-trained or fine-tuned checkpoints. MixtureKit supports three complementary strategies: (i) Traditional MoE, using a single router per transformer block to select experts; (ii) BTX (Branch-Train-Mix), adding routers at user-specified sub-layers for fine-grained token routing; and (iii) BTS (Branch-Train-Stitch), preserving experts intact and introducing lightweight stitch layers for controlled hub{--}expert information exchange. Given a single configuration dictionary, MixtureKit automatically modifies model configuration, patches decoder and causal LM classes, and exports a unified transformers-compatible checkpoint ready for inference or further fine-tuning. We also provide a visualization interface to inspect token routing, expert weight distributions, and layer-wise contributions. Experiments on multilingual code-switched (Arabic{--}Latin) data show that BTX models built with MixtureKit can outperform dense baselines across multiple benchmarks. The library is accessible at: https://github.com/MBZUAI-Paris/MixtureKit.},
}
Markdown (Informal)
[MixtureKit: A General Framework for Composing, Training, and Visualizing Mixture-of-Experts Models](https://preview.aclanthology.org/ingest-acl/2026.acl-demo.15/) (Chamma et al., ACL 2026)
ACL