% Conference paper (LREC 2026; ACL Anthology ID 2026.lrec-main.129).
% Typed as inproceedings so that styles print the proceedings title and editors.
% NOTE(review): booktitle is the venue name carried over from the exported
%   "journal" field; confirm the official proceedings title.
% NOTE(review): the url points at an Anthology preview/ingest build; confirm
%   the final permalink once the volume is published.
% NOTE(review): the exported volume value "main" is not a volume number
%   (presumably the Anthology volume id, cf. "lrec-main" in the url); it is
%   kept in the non-standard, style-ignored field "anthology-volume".
@inproceedings{shurtz-etal-2026-mekongphon,
  author           = {Shurtz, Ammon and
                      Richardson, Christian and
                      Richardson, Stephen D.},
  title            = {{MekongPhon}: A Large-Scale Parallel {IPA} Corpus for {Lao} and {Khmer}},
  editor           = {Piperidis, Stelios and
                      Bel, N{\'u}ria and
                      van den Heuvel, Henk and
                      Ide, Nancy and
                      Krek, Simon and
                      Toral, Antonio},
  booktitle        = {International Conference on Language Resources and Evaluation},
  anthology-volume = {main},
  pages            = {1650--1658},
  publisher        = {ELRA Language Resource Association},
  address          = {Palma de Mallorca, Spain},
  month            = may,
  year             = {2026},
  url              = {https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.129/},
  abstract         = {High-quality International Phonetic Alphabet (IPA) transcriptions are a foundational resource for speech and language technologies, yet existing tools for many low-resource languages remain limited in accuracy and scope. In this work, we present MekongPhon, a large-scale, high-quality parallel IPA corpus for Lao and Khmer. The corpus contains 1.3 million Khmer and 367 thousand Lao orthographic{--}IPA pairs, meticulously aligned and verified. When used to train Transformer-based sequence-to-sequence models, MekongPhon enables exceptionally accurate IPA generation, achieving under 2{\%} Character Error Rate (CER) on held-out test sets. We further introduce linguistically informed Lao and Khmer transliteration tools that offer high-speed IPA conversion, outperforming Epitran by 6-71 CER points despite trading some accuracy for efficiency. All data, code, and pretrained models are publicly released to support future research and development in low-resource language technologies.},
}
Markdown (Informal)
[MekongPhon: A Large-Scale Parallel IPA Corpus for Lao and Khmer](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.129/) (Shurtz et al., LREC 2026)
ACL