@inproceedings{zhang-etal-2025-xfinbench,
title = "{XF}in{B}ench: Benchmarking {LLM}s in Complex Financial Problem Solving and Reasoning",
author = "Zhang, Zhihan and
Cao, Yixin and
Liao, Lizi",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-acl.457/",
pages = "8715--8758",
ISBN = "979-8-89176-256-5",
    abstract = "Solving financial problems demands complex reasoning, multimodal data processing, and a broad technical understanding, presenting unique challenges for current large language models (LLMs). We introduce XFinBench, a novel benchmark with 4,235 examples designed to evaluate LLMs{'} ability to solve compleX, knowledge-intensive Financial problems across diverse graduate-level finance topics with multi-modal context. We identify five core capabilities of LLMs using XFinBench, i.e., terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling. Upon XFinBench, we conduct extensive experiments on 18 leading models. The results show that o1 is the best-performing text-only model with an overall accuracy of 67.3{\%}, but it still lags significantly behind human experts by 12.5{\%}, especially in temporal reasoning and scenario planning capabilities. We further construct a knowledge bank with 3,032 finance terms for knowledge augmentation analysis, and find that knowledge relevant to the question brings consistent accuracy improvements only to small open-source models. Additionally, our error analysis reveals that rounding errors during calculation and blindness to the position and intersection of curves in images are the two primary issues behind models{'} poor performance on calculating and visual-context questions, respectively."
}
Markdown (Informal)
[XFinBench: Benchmarking LLMs in Complex Financial Problem Solving and Reasoning](https://aclanthology.org/2025.findings-acl.457/) (Zhang et al., Findings 2025)
ACL
Zhihan Zhang, Yixin Cao, and Lizi Liao. 2025. [XFinBench: Benchmarking LLMs in Complex Financial Problem Solving and Reasoning](https://aclanthology.org/2025.findings-acl.457/). In *Findings of the Association for Computational Linguistics: ACL 2025*, pages 8715–8758, Vienna, Austria. Association for Computational Linguistics.