% Conference paper in Findings of ACL 2026 (Tang, Zhang, Li, Zhou).
% NOTE(review): `url` is the canonical ACL Anthology address derived from the
% Anthology ID 2026.findings-acl.1038. The exported record pointed at the
% temporary preview.aclanthology.org/ingest-acl staging site, which is not a
% permanent link. Confirm the canonical page is live.
% `address` holds the conference location, following the ACL Anthology export
% convention.
@inproceedings{tang-etal-2026-dont,
  title     = {Don{'}t Just Listen, Try Planning: Graph-based Retrieval-Generation Agent for Long-form Audio Meeting Understanding},
  author    = {Tang, Quanwei and
               Zhang, Dong and
               Li, Shoushan and
               Zhou, Guodong},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1038/},
  pages     = {20715--20742},
  isbn      = {979-8-89176-395-1},
  abstract  = {Long-form audio meeting understanding (LAMU) is gaining attention, but dedicated question answering (QA) datasets are lacking. Previous tailored speech QA and existing Speech LLMs suffer from acoustic information loss and poor long-term dependency capture. We construct the LongAudioQA dataset and propose the GRGA model, which models heterogeneous audio features into a multi-dimensional graph and leverages agent planning for retrieval and answer generation, effectively addressing existing limitations.},
}
Markdown (Informal)
[Don’t Just Listen, Try Planning: Graph-based Retrieval-Generation Agent for Long-form Audio Meeting Understanding](https://aclanthology.org/2026.findings-acl.1038/) (Tang et al., Findings 2026)
ACL