@inproceedings{cao-etal-2025-sync,
title = "{SYNC}: A Synthetic Long-Context Understanding Benchmark for Controlled Comparisons of Model Capabilities",
author = "Cao, Shuyang and
Zou, Kaijian and
Wang, Lu",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1707/",
pages = "33615--33636",
ISBN = "979-8-89176-332-6",
    abstract = "Recently, researchers have turned to synthetic tasks for evaluation of large language models' long-context capabilities, as they offer more flexibility than realistic benchmarks in scaling both input length and dataset size. However, existing synthetic tasks typically target narrow skill sets such as retrieving information from massive input, limiting their ability to comprehensively assess model capabilities. Furthermore, existing benchmarks often pair each task with a different input context, creating confounding factors that prevent fair cross-task comparison. To address these limitations, we introduce SYNC, a new evaluation suite of synthetic tasks spanning domains including graph understanding and translation. Each domain includes three tasks designed to test a wide range of capabilities{---}from retrieval, to multi-hop tracking, and to global context understanding that requires chain-of-thought (CoT) reasoning. Crucially, all tasks share the same context, enabling controlled comparisons of model performance. We evaluate 14 LLMs on SYNC and observe substantial performance drops on more challenging tasks, underscoring the benchmark{'}s difficulty. Additional experiments highlight the necessity of CoT reasoning and demonstrate that SYNC poses a robust challenge for future models."
}