@inproceedings{kang-xiong-2025-researcharena,
title = "{R}esearch{A}rena: Benchmarking Large Language Models' Ability to Collect and Organize Information as Research Agents",
author = "Kang, Hao and
Xiong, Chenyan",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-emnlp.303/",
doi = "10.18653/v1/2025.findings-emnlp.303",
pages = "5653--5671",
ISBN = "979-8-89176-335-7",
abstract = "Large language models (LLMs) excel across many natural language processing tasks but face challenges in domain-specific, analytical tasks such as conducting research surveys. This study introduces ResearchArena, a benchmark designed to evaluate LLMs' capabilities in conducting academic surveys{---}a foundational step in academic research. ResearchArena models the process in three stages: (1) information discovery, identifying relevant literature; (2) information selection, evaluating papers' relevance and impact; and (3) information organization, structuring knowledge into hierarchical frameworks such as mind-maps. Notably, mind-map construction is treated as a bonus task, reflecting its supplementary role in survey-writing. To support these evaluations, we construct an offline environment of 12M full-text academic papers and 7.9K survey papers. To ensure ethical compliance, we do not redistribute copyrighted materials; instead, we provide code to construct the environment from the Semantic Scholar Open Research Corpus (S2ORC). Preliminary evaluations reveal that LLM-based approaches underperform compared to simpler keyword-based retrieval methods, though recent reasoning models such as DeepSeek-R1 show slightly better zero-shot performance. These results underscore significant opportunities for advancing LLMs in autonomous research. We open-source the code to construct the ResearchArena benchmark at https://github.com/cxcscmu/ResearchArena."
}
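The abstract notes that simple keyword-based retrieval still beats LLM-based approaches on the information-discovery stage. As a minimal sketch of what such a baseline looks like (not the paper's implementation; the corpus snippets and query here are hypothetical, and the `rank_bm25` package is assumed):

```python
# Minimal BM25 keyword-retrieval baseline sketch for the
# information-discovery stage (rank papers by lexical match).
# Assumes: pip install rank_bm25 ; documents/query are made up.
from rank_bm25 import BM25Okapi

corpus = [
    "dense retrieval with dual encoders for open-domain QA",
    "survey of large language model agents for scientific discovery",
    "keyword matching baselines for academic literature search",
]
tokenized_corpus = [doc.lower().split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

query = "llm research agents literature survey".split()
scores = bm25.get_scores(query)

# Rank candidate papers by BM25 score, highest first.
for i in sorted(range(len(corpus)), key=lambda i: scores[i], reverse=True):
    print(f"{scores[i]:.3f}  {corpus[i]}")
```

In the full benchmark the corpus would be the 12M-paper offline environment built from S2ORC rather than an in-memory list; the ranking step is otherwise the same idea.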