@inproceedings{bandarkar-peng-2025-unreasonable,
title = "The Unreasonable Effectiveness of Model Merging for Cross-Lingual Transfer in {LLM}s",
author = "Bandarkar, Lucas and
Peng, Nanyun",
editor = "Adelani, David Ifeoluwa and
Arnett, Catherine and
Ataman, Duygu and
Chang, Tyler A. and
Gonen, Hila and
Raja, Rahul and
Schmidt, Fabian and
Stap, David and
Wang, Jiayi",
booktitle = "Proceedings of the 5th Workshop on Multilingual Representation Learning (MRL 2025)",
month = nov,
year = "2025",
address = "Suzhuo, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/name-variant-enfa-fane/2025.mrl-main.10/",
doi = "10.18653/v1/2025.mrl-main.10",
pages = "131--148",
ISBN = "979-8-89176-345-6",
abstract = "Large language models (LLMs) still struggle across tasks outside of high-resource languages. In this work, we investigate cross-lingual transfer to lower-resource languages where task-specific post-training data is scarce. Building on prior work, we first validate that the subsets of model parameters that matter most for mathematical reasoning and multilingual capabilities are distinctly non-overlapping. To exploit this implicit separability between task and target language parameterization, we develop and analyze numerous modular frameworks to improve the composition of the two during fine-tuning. These methods generally employ freezing parameters or post hoc model merging to assign math and language improvement to different key parts of the LLM. In the absence of in-language math data, we demonstrate that the modular approaches successfully improve upon baselines across three languages, four models, and two fine-tuning paradigms (full and LoRA). Furthermore, we identify the most consistently successful modular method to be fine-tuning separate language and math experts and model merging via Layer-Swapping, somewhat surprisingly. We offer possible explanations for this result via recent works on the linearity of task vectors. We further explain this by empirically showing that reverting less useful fine-tuning updates after training often outperforms freezing them from the start."
}

Markdown (Informal)
[The Unreasonable Effectiveness of Model Merging for Cross-Lingual Transfer in LLMs](https://aclanthology.org/2025.mrl-main.10/) (Bandarkar & Peng, MRL 2025)
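
For context on the merging recipe the abstract refers to, below is a minimal sketch of a layer-swapping style merge between two fine-tuned experts. The checkpoint paths, the Llama-style parameter key pattern, and the specific layer-assignment rule (bottom and top transformer layers taken from the language expert, everything else from the math expert) are illustrative assumptions, not the paper's exact configuration.

```python
# Minimal sketch of a layer-swapping style merge between two fine-tuned experts.
# Assumptions (not taken from the paper): Llama-style parameter names
# ("model.layers.<i>."), hypothetical checkpoint paths, and a simple rule that
# takes the bottom 4 and top 4 transformer layers from the language expert
# while keeping all remaining parameters from the math expert.
import re
from transformers import AutoModelForCausalLM

math_expert = AutoModelForCausalLM.from_pretrained("math-expert-checkpoint")  # hypothetical path
lang_expert = AutoModelForCausalLM.from_pretrained("lang-expert-checkpoint")  # hypothetical path

math_sd = math_expert.state_dict()
lang_sd = lang_expert.state_dict()

num_layers = math_expert.config.num_hidden_layers
# Illustrative layer assignment: bottom 4 and top 4 layers come from the language expert.
lang_layers = set(range(4)) | set(range(num_layers - 4, num_layers))

merged_sd = {}
for key, tensor in math_sd.items():
    match = re.match(r"model\.layers\.(\d+)\.", key)
    from_lang = match is not None and int(match.group(1)) in lang_layers
    merged_sd[key] = lang_sd[key] if from_lang else tensor

math_expert.load_state_dict(merged_sd)
math_expert.save_pretrained("layer-swapped-merge")  # hypothetical output directory
```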