@inproceedings{xiao-zhao-2025-b,
title = "From {A} and {B} to {A}+{B}: Can Large Language Models Solve Compositional Math Problems?",
author = "Xiao, Xisheng and
Zhao, Hanlin",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.660/",
pages = "13068--13089",
ISBN = "979-8-89176-332-6",
abstract = "Large language models (LLMs) have demonstrated strong performance in solving math problems, and there is growing research on evaluating their robustness. Unlike previous studies that create problem variants by adding perturbations to a single problem, this paper focuses on the interaction between problems. Specifically, we combine two original problems with a logical connection into a new math problem and measure LLMs' performance on it to evaluate their compositional generalization, an important and essential reasoning capability in human intelligence. The results of experiments covering 14 different LLMs show that even when the mathematical essence remains unchanged, a simple form of combination can significantly reduce the performance of LLMs, revealing the limitations of their generalization ability. Additionally, we propose an automated pipeline with 98.2{\%} accuracy to assist in annotating datasets (1 manual, 2 synthetic). Extensive experiments conducted on these datasets further verify this conclusion and yield several important findings. Finally, we analyze the impact of factors such as difficulty and length on LLMs' performance, offering insights for future research."
}