% Feng and Rudzicz (2025), "CausalLink" -- paper in the Findings of ACL 2025 volume.
% NOTE(review): `address` carries the city/country given by the exporter, which
% looks like the conference venue rather than the publisher's city -- confirm
% before relying on it under styles that print publisher location.
@inproceedings{feng-rudzicz-2025-causallink,
  title     = {{CausalLink}: An Interactive Evaluation Framework for Causal Reasoning},
  author    = {Feng, Jinyue and
               Rudzicz, Frank},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.1147/},
  pages     = {22313--22326},
  isbn      = {979-8-89176-256-5},
  abstract  = {We present CausalLink, an innovative evaluation framework that interactively assesses the causal reasoning skill to identify the correct intervention in conversational language models. Each CausalLink test case creates a hypothetical environment in which the language models are instructed to apply interventions to entities whose interactions follow predefined causal relations generated from controllable causal graphs. Our evaluation framework isolates causal capabilities from the confounding effects of world knowledge and semantic cues. We evaluate a series of LLMs in a scenario featuring movements of geometric shapes and discover that models start to exhibit reliable reasoning on two or three variables at the 14-billion-parameter scale. However, the performance of state-of-the-art models such as GPT4o degrades below random chance as the number of variables increases. We identify and analyze several key failure modes.},
}
Markdown (Informal)
[CausalLink: An Interactive Evaluation Framework for Causal Reasoning](https://preview.aclanthology.org/landing_page/2025.findings-acl.1147/) (Feng & Rudzicz, Findings 2025)
ACL