@inproceedings{han-gardent-2025-generating-questions,
title = "Generating Questions Under Discussion with Reinforcement Learning using Ranking and Scoring for Reward and Evaluation",
author = "Han, Kelvin and
Gardent, Claire",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.35/",
pages = "589--615",
ISBN = "979-8-89176-303-6",
abstract = "There is growing research interest in Questions Under Discussion (QUD), a linguistic framework for representing discourse in the form of natural language question-answer pairs, which are more easily understandable and have been found useful in several applications. Our goal in this work is to improve on the quality of automatic QUD generation. As a way to sidestep the paucity of data currently, we propose a reinforcement learning-based approach using the Group Relative Policy Optimisation (GRPO) objective for LLM post-training on the task. To get there, we: (i) carefully investigated five promising methods for reference-free automatic QUD evaluation, (ii) proposed a novel prompting strategy, SCRS, involving ranking and scoring with structured outputs that enables QUD evaluation close to the human upperbound, (iii) leveraged findings from (i) with (ii) for the knowledge distillation from a very large LLM to obtain a more resource-efficient reward model, and which (iv) we then used in the GRPO post-training for 3B LLMs on the QUD generation task. Our QUD generators give overall higher-quality QUDs compared to the SOTA which is based on supervised fine-tuning; all of these are achieved using only three annotated exemplars in the few-shot prompting for evaluation, and without the use of any other annotated questions for training the QUD generators. Our code, models, and annotated examples can be found at https://github.com/hankelvin/grpo{\_}qud{\_}generation."
}