% Conference paper in the ACL 2026 main proceedings (Volume 1: Long Papers).
% The system name in the title is brace-protected so that sentence-casing
% bibliography styles do not lowercase it to "Speech-hands".
% NOTE(review): the url field points at the Anthology preview/ingest host;
% confirm whether it should be replaced by the canonical aclanthology.org
% address once the volume is published.
@inproceedings{wan-etal-2026-speech,
  author    = {Wan, Zhen and
               Yang, Chao-Han Huck and
               Tian, Jinchuan and
               Ye, Hanrong and
               Pasad, Ankita and
               Fu, Szu-Wei and
               Goel, Arushi and
               Hachiuma, Ryo and
               Diao, Shizhe and
               Dhawan, Kunal and
               Ghosh, Sreyan and
               Hirota, Yusuke and
               Chen, Zhehuai and
               Valle, Rafael and
               Chu, Chenhui and
               Watanabe, Shinji and
               Ginsburg, Boris and
               Wang, Yu-Chiang Frank},
  title     = {{Speech-Hands}: A Self-Reflection Voice Agentic Approach to Speech Recognition and Audio Reasoning with Omni Perception},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {43124--43142},
  isbn      = {979-8-89176-390-6},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1997/},
}