% Findings of EMNLP 2025 paper on homograph-aware grapheme-to-phoneme (G2P)
% conversion for Persian; introduces the HomoRich dataset and HomoFast eSpeak.
% The url field uses the canonical ACL Anthology address, not a preview-branch build.
@inproceedings{qharabagh-etal-2025-fast,
    title = "Fast, Not Fancy: Rethinking {G2P} with Rich Data and Statistical Models",
    author = "Qharabagh, Mahta Fetrat and
      Dehghanian, Zahra and
      Rabiee, Hamid R.",
    editor = "Christodoulopoulos, Christos and
      Chakraborty, Tanmoy and
      Rose, Carolyn and
      Peng, Violet",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-emnlp.1218/",
    doi = "10.18653/v1/2025.findings-emnlp.1218",
    pages = "22382--22408",
    isbn = "979-8-89176-335-7",
    abstract = "Homograph disambiguation remains a significant challenge in grapheme-to-phoneme (G2P) conversion, especially for low-resource languages. This challenge is twofold: (1) creating balanced and comprehensive homograph datasets is labor-intensive and costly, and (2) specific disambiguation strategies introduce additional latency, making them unsuitable for real-time applications such as screen readers and other accessibility tools. In this paper, we address both issues. First, we propose a semi-automated pipeline for constructing homograph-focused datasets, introduce the HomoRich dataset generated through this pipeline, and demonstrate its effectiveness by applying it to enhance a state-of-the-art deep learning-based G2P system for Persian. Second, we advocate for a paradigm shift{---}utilizing rich offline datasets to inform the development of fast, statistical methods suitable for latency-sensitive accessibility applications like screen readers. To this end, we improve one of the most well-known rule-based G2P systems, eSpeak, into a fast homograph-aware version, HomoFast eSpeak. Our results show an approximate 30 percentage-point improvement in homograph disambiguation accuracy for the deep learning-based and eSpeak systems."
}
Markdown (Informal)
[Fast, Not Fancy: Rethinking G2P with Rich Data and Statistical Models](https://aclanthology.org/2025.findings-emnlp.1218/) (Qharabagh et al., Findings 2025)
ACL