@inproceedings{son-etal-2025-linguistic,
title = "Linguistic Generalizability of Test-Time Scaling in Mathematical Reasoning",
author = "Son, Guijin and
Hong, Jiwoo and
Ko, Hyunwoo and
Thorne, James",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.699/",
pages = "14333--14368",
ISBN = "979-8-89176-251-0",
abstract = "Scaling pre-training compute has proven effective for achieving multilinguality, but does the same hold for test-time scaling? In this work, we introduce **MCLM**, a multilingual math benchmark featuring competition-level problems in 55 languages. We then compare three test-time scaling methods{---}Outcome Reward Modeling, Process Reward Modeling, and Budget Forcing. Our findings indicate that although ``thinking LLMs'' have recently garnered significant attention, their performance is comparable to traditional scaling methods like best-of-N once constrained to similar levels of inference FLOPs. More importantly, all tested methods fail to generalize robustly across languages, achieving only modest gains that are smaller than those observed in English, with no improvements in variance or consistency. To foster further research, we release MCLM and MR1-1.5B (a multilingual LLM with reasoning capabilities) and our evaluation results."
}