% Workshop paper: TamilTok (Muthukumar, Herygers, Beinborn), DravidianLangTech 2026.
% Case-sensitive words in title/booktitle are protected with whole-word braces so
% sentence-casing styles keep "TamilTok", "Tamil" and "Dravidian" intact.
% NOTE(review): the url field points at a preview.aclanthology.org ingest host,
% which is presumably a temporary staging address -- confirm, and replace it with
% the canonical Anthology URL once the volume is live.
@inproceedings{muthukumar-etal-2026-tamiltok,
  title     = {{TamilTok}: Morphologically-Informed Tokenization for {Tamil}},
  author    = {Muthukumar, Surendhar and
               Herygers, Aaricia and
               Beinborn, Lisa},
  editor    = {Chakravarthi, Bharathi Raja and
               Priyadharshini, Ruba and
               Madasamy, Anand Kumar and
               Thavareesan, Sajeetha and
               Rajiakodi, Saranya and
               Navaneethakrishnan, Subalalitha and
               Chinnappa, Dhivya and
               Palani, Balasubramanian and
               Subramanian, Malliga and
               Shanmugavadivel, Kogilavani and
               Rajalakshmi, Ratnavel},
  booktitle = {Proceedings of the Sixth Workshop on Speech, Vision, and Language Technologies for {Dravidian} Languages},
  month     = jul,
  year      = {2026},
  address   = {Underline (Virtual)},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.dravidianlangtech-1.7/},
  pages     = {52--61},
  isbn      = {979-8-89176-401-9},
  abstract  = {Tokenization is fundamental to neural language modeling, yet for Tamil it remains largely adapted from general-purpose multilingual models without systematic consideration of the rich agglutinative morphology. We introduce TamilMorph, a large-scale dataset of more than 480,000 morphologically segmented Tamil word forms. Building on this new resource, we develop TamilTok, a morphology-aware tokenization framework that incorporates explicit morpheme structure into tokenizer training. We benchmark Tamil tokenization quality across multiple tokenization algorithms and vocabulary configurations and find that our approach improves both morphological alignment and downstream performance compared to previous approaches. Our morphological resource for Tamil and our systematic empirical analyses can guide future developments of tokenization for morphologically rich languages.},
}
Markdown (Informal)
[TamilTok: Morphologically-Informed Tokenization for Tamil](https://preview.aclanthology.org/ingest-acl-workshops/2026.dravidianlangtech-1.7/) (Muthukumar et al., DravidianLangTech 2026)
ACL
- Surendhar Muthukumar, Aaricia Herygers, and Lisa Beinborn. 2026. TamilTok: Morphologically-Informed Tokenization for Tamil. In Proceedings of the Sixth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages, pages 52–61, Underline (Virtual). Association for Computational Linguistics.