@inproceedings{boughorbel-etal-2025-beyond,
title = "Beyond the Leaderboard: Understanding Performance Disparities in Large Language Models via Model Diffing",
author = "Boughorbel, Sabri and
Dalvi, Fahim and
Durrani, Nadir and
Hawasly, Majd",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1598/",
pages = "31348--31359",
ISBN = "979-8-89176-332-6",
abstract = "As fine-tuning becomes the dominant paradigm for improving large language models (LLMs), understanding what changes during this process is increasingly important. Traditional benchmarking often fails to explain {\_}why{\_} one model outperforms another. In this work, we use model diffing, a mechanistic interpretability approach, to analyze the specific capability differences between Gemma-2-9b-it and a SimPO-enhanced variant. Using crosscoders, we identify and categorize latent representations that differentiate the two models. We find that SimPO-acquired latent concepts predominantly enhance safety mechanisms (+32.8{\%}), multilingual capabilities (+43.8{\%}), and instruction-following (+151.7{\%}), while its additional training also reduces emphasis on model self-reference (-44.1{\%}) and hallucination management (-68.5{\%}). Our analysis shows that model diffing can yield fine-grained insights beyond leaderboard metrics, attributing performance gaps to concrete mechanistic capabilities. This approach offers a transparent and targeted framework for comparing LLMs."
}