@inproceedings{benton-etal-2026-mining,
  author        = {Benton, Adrian and
                   Gutkin, Alexander and
                   Kirov, Christo and
                   Roark, Brian},
  title         = {Mining Naturally {Romanized} Seed Corpora without Romanizations},
  editor        = {Piperidis, Stelios and
                   Bel, N{\'u}ria and
                   van den Heuvel, Henk and
                   Ide, Nancy and
                   Krek, Simon and
                   Toral, Antonio},
  booktitle     = {International Conference on Language Resources and Evaluation},
  month         = may,
  year          = {2026},
  address       = {Palma de Mallorca, Spain},
  publisher     = {ELRA Language Resource Association},
  pages         = {2996--3012},
  url           = {https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.234/},
  abstract      = {While the Latin script is used informally by speakers of many languages with different native scripts, high quality Latin script corpora for such languages that reflect actual natural romanizations are scarce and often difficult to collect. In this work, we propose a method for mining romanized language corpora in languages for which we do not have any pre-existing samples of naturally romanized text, focusing on Tigrinya as a test case. First we examine the efficacy of learning romanizations for a language based on observed romanizations in other languages that use the same native script. We then extrinsically assess such methods by using a romanization model trained on Amharic data to bootstrap coverage of romanized Tigrinya in a language identification system. Manual evaluation by two L1 and one L2 Tigrinya speakers suggests our method extracts romanized Tigrinya text with acceptably high precision.},
  internal-note = {The exported record carried volume = "main". It is kept here instead of in the volume field because it is a non-numeric label that styles would print as a volume number. The record was also exported as an article with the conference name in the journal field.},
}
Markdown (Informal)
[Mining Naturally Romanized Seed Corpora without Romanizations](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.234/) (Benton et al., LREC 2026)
ACL