% Conference paper by Javaji and Zhu in the IJCNLP-AACL 2025 proceedings.
% The inline math in the title is brace-protected so that style-driven
% case changes leave the formula untouched.
% NOTE(review): the url field points at an ACL Anthology preview (ingest)
% branch; confirm the canonical address once the volume is published.
% NOTE(review): address presumably holds the conference venue rather than
% the publisher's city (Anthology export convention); confirm if the
% target style prints a publisher location.
@inproceedings{javaji-zhu-2025-ask,
  title     = {What Would You Ask When You First Saw {$a^2+b^2=c^2$}? Evaluating {LLM} on Curiosity-Driven Question Generation},
  author    = {Javaji, Shashidhar Reddy and
               Zhu, Zining},
  editor    = {Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.126/},
  pages     = {2322--2354},
  isbn      = {979-8-89176-298-5},
  abstract  = {Large language models (LLMs) are increasingly widely used as critical components of knowledge retrieval systems and agentic systems. These systems can benefit from knowledge-seeking capabilities of LLMs, in other words, curiosity. However, this capability has not been evaluated quantitatively. Towards bridging this gap, we propose an evaluation framework, CDQG (Curiosity-Driven Question Generation). The CDQG task prompts LLMs to generate questions about a statement introducing scientific knowledge, simulating a curious person when facing the statement for the first time. The CDQG dataset contains 1,988 statements including physics, chemistry, and mathematics with distinct levels of difficulty, general knowledge statements, and intentionally erroneous statements. We score the qualities of the questions generated by LLMs along multiple dimensions. These scores are validated by rigorous controlled ablation studies and human evaluations. While large models like GPT-4 and Mistral 8x7b can generate highly coherent and relevant questions, the smaller Phi-2 model is equally or more effective. This indicates that size does not solely determine a model{'}s knowledge acquisition potential. CDQG quantifies a critical model capability, and opens up research opportunities for developing future knowledge retrieval systems driven by LLMs.},
}
Markdown (Informal)
[What Would You Ask When You First Saw $a^2+b^2=c^2$? Evaluating LLM on Curiosity-Driven Question Generation](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.126/) (Javaji & Zhu, IJCNLP-AACL 2025)
ACL