@comment{
  Workshop paper entry (AmericasNLP proceedings, 2026) on Nahuatl language
  identification. Names use the unambiguous "von Last, First" form so compound
  surnames and particles are parsed as intended.
  NOTE(review): the url field points at a preview.aclanthology.org staging
  build (ingest-acl-workshops branch); presumably it should be swapped for the
  canonical aclanthology.org link once the volume is published -- confirm.
}
@inproceedings{mercado-campos-etal-2026-power,
  title     = {The Power of Simplicity: N-Grams and Transformers in {Nahuatl} Language Identification},
  author    = {Mercado Campos, Luis and
               Pugh, Robert and
               Palmer, Alexis},
  editor    = {Mager, Manuel and
               Ebrahimi, Abteen and
               Bui, Minh Duc and
               Pugh, Robert and
               Oncevay, Arturo and
               Chiruzzo, Luis and
               Coto-Solano, Rolando and
               Rijhwani, Shruti and
               von der Wense, Katharina},
  booktitle = {Proceedings of the Sixth Workshop on {NLP} for Indigenous Languages of the {Americas} ({AmericasNLP})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.14/},
  pages     = {153--167},
  isbn      = {979-8-89176-415-6},
  abstract  = {In the context of real-world language technology applications, the language or variety in which a given text is written is often unknown or uncertain. Yet, this information is crucial in order to adequately select and apply appropriate models or resources. Language identification (LID), or the process of determining the language or variety of a text sample, is thus often an important fundamental task in natural language processing. LID can be particularly challenging when: (1) there are not many labeled texts for training; and (2) similar or related languages are involved, since these may share a number of surface-level features. In this paper, we present an LID system for Nahuatl, a group of closely-related language varieties spoken in Mexico and Central America. Nahuatl LID involves both of the aforementioned challenges: Nahuatl varieties can be quite similar, sharing morphemes and even many lexical items, and there is a relative paucity of representative, variant-labeled Nahuatl text. We describe LID experiments for a total of 11 Nahuatl varieties, achieving generally good results (90.59{\%} {\ensuremath{\pm}}0.09{\%} in 5-fold cross-validation experiments). Many of the outstanding errors are the result of confusion between three highly similar Huasteca variants.},
}
Markdown (Informal)
[The Power of Simplicity: N-Grams and Transformers in Nahuatl Language Identification](https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.14/) (Mercado Campos et al., AmericasNLP 2026)
ACL