% Workshop paper (pages 8--15) in the proceedings named in booktitle.
% NOTE(review): the author is stored with "K H" as the family-name part and
%   "Manodyna" as the given-name part; confirm this split is intended.
% NOTE(review): url points at a preview.aclanthology.org branch build
%   ("manual-author-scripts"); confirm whether a permanent URL should replace it.
@inproceedings{k-h-2026-nlp,
  title     = {What {NLP} Gets Wrong About Contact: Implications for Field Linguistic Evidence},
  author    = {K H, Manodyna},
  booktitle = {Proceedings of the Fifth Workshop on {NLP} Applications to Field Linguistics},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/manual-author-scripts/2026.fieldmatters-1.2/},
  pages     = {8--15},
  abstract  = {Field linguistics increasingly relies on computational tools to organize, analyze, and preserve linguistic data, yet the classificatory assumptions embedded in these tools are rarely examined. A pervasive assumption is that languages can be treated as discrete, genealogically defined units, with relatedness modeled as tree-structured descent. We argue that this assumption misrepresents linguistic evidence in contact-heavy regions and risks distorting the computational mediation of field linguistic data. Focusing on South Asia, we show that widely assumed boundaries{---}such as the Indo-Aryan{--}Dravidian divide{---}collapse in long-standing contact zones characterized by convergence, dialect continua, and institutional multilingualism. Through historically grounded case studies including Kannada{--}Telugu and Tamil{--}Malayalam, we demonstrate how convergence, script-mediated distance, and post-hoc standardization reshape how field data is segmented, compared, and interpreted when organized through genealogical labels. We argue that contact-aware, relational models of linguistic relatedness are necessary if NLP tools are to support, rather than distort, the documentation and analysis of linguistic diversity.},
}
Markdown (Informal)
[What NLP Gets Wrong About Contact: Implications for Field Linguistic Evidence](https://preview.aclanthology.org/manual-author-scripts/2026.fieldmatters-1.2/) (K H, FieldMatters 2026)
ACL