% MMSciCode benchmark paper, ACL 2026 (Volume 1: Long Papers).
% The address field holds the conference location, as in the exported record.
% NOTE(review): the url below is on the Anthology preview/ingest host, as exported;
% confirm whether a canonical aclanthology.org URL should replace it once published.
@inproceedings{xia-etal-2026-mmscicode,
    title     = {{MMSciCode}: Real-world Evaluation of Multilingual Multi-Discipline Scientific Research Coding},
    author    = {Xia, Xue and
                 Yang, Zheyuan and
                 Cohan, Arman and
                 Zhao, Yilun},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1566/},
    pages     = {33981--33999},
    isbn      = {979-8-89176-390-6},
    abstract  = {We introduce MMSciCode, a comprehensive expert-level, multilingual multi-discipline benchmark for evaluating foundation models in scientific code generation. It includes 624 expert-annotated research coding problems spanning six core scientific disciplines. Compared to prior benchmarks, MMSciCode features three key advancements. First, it challenges models to integrate domain-specific knowledge with algorithmic reasoning to implement core functions from research papers, moving beyond the isolated, general-purpose coding tasks typically assessed in current benchmarks. Second, each problem is meticulously annotated by domain experts through a rigorous paper-grounded process, with strict quality controls implemented to ensure dataset integrity and authenticity. Finally, each problem is equipped with comprehensive unit test suites and containerized environments, enabling reproducible and diagnostic evaluation of both functional correctness and domain validity. We conduct an extensive evaluation of 28 state-of-the-art foundation models and 2 agentic coding tools on MMSciCode. Our results reveal that even the best non-agentic model achieves only around 15{\%} accuracy, while the top agentic coding tool reaches 32.2{\%}, both still far below human expert performance of 68.8{\%}. Through comprehensive error analyses and case studies, we identify substantial performance gaps between models and human experts, providing actionable insights for advancing expert-level scientific code generation.}
}
Markdown (Informal)
[MMSciCode: Real-world Evaluation of Multilingual Multi-Discipline Scientific Research Coding](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1566/) (Xia et al., ACL 2026)
ACL