% TuringQ benchmark paper, Findings of EMNLP 2024 (ACL Anthology ID 2024.findings-emnlp.715).
% The url field uses the canonical Anthology permalink rather than a preview-branch build.
@inproceedings{zahraei-asgari-2024-turingq,
  title     = {{TuringQ}: Benchmarking {AI} Comprehension in Theory of Computation},
  author    = {Zahraei, Pardis Sadat and Asgari, Ehsaneddin},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.715/},
  doi       = {10.18653/v1/2024.findings-emnlp.715},
  pages     = {12267--12280},
  abstract  = {We present TuringQ, the first benchmark designed to evaluate the reasoning capabilities of large language models (LLMs) in the theory of computation. TuringQ consists of 4,006 undergraduate and graduate-level question-answer pairs, categorized into four difficulty levels and covering seven core theoretical areas. We evaluate several open-source LLMs, as well as GPT-4, using Chain of Thought prompting and expert human assessment. Additionally, we propose an automated LLM-based evaluation system that demonstrates competitive accuracy when compared to human evaluation. Fine-tuning a Llama3-8B model on TuringQ shows measurable improvements in reasoning ability and out-of-domain tasks such as algebra. TuringQ serves as both a benchmark and a resource for enhancing LLM performance in complex computational reasoning tasks. Our analysis offers insights into LLM capabilities and advances in AI comprehension of theoretical computer science.},
}
Markdown (Informal)
[TuringQ: Benchmarking AI Comprehension in Theory of Computation](https://aclanthology.org/2024.findings-emnlp.715/) (Zahraei & Asgari, Findings 2024)
ACL