@inproceedings{wang-zhao-2024-tram,
title = "{TRAM}: Benchmarking Temporal Reasoning for Large Language Models",
author = "Wang, Yuqing and
Zhao, Yun",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.findings-acl.382/",
doi = "10.18653/v1/2024.findings-acl.382",
pages = "6389--6415",
abstract = "Reasoning about time is essential for understanding the nuances of events described in natural language. Previous research on this topic has been limited in scope, characterized by a lack of standardized benchmarks that would allow for consistent evaluations across different studies. In this paper, we introduce TRAM, a temporal reasoning benchmark composed of ten datasets, encompassing various temporal aspects of events such as order, arithmetic, frequency, and duration, designed to facilitate a comprehensive evaluation of the TeR capabilities of large language models (LLMs). We evaluate popular LLMs like GPT-4 and Llama2 in zero-shot and few-shot scenarios, and establish baselines with BERT-based and domain-specific models. Our findings indicate that the best-performing model lags significantly behind human performance. It is our aspiration that TRAM will spur further progress in enhancing the TeR capabilities of LLMs."
}