@inproceedings{goel-sadat-2025-studying,
  title     = {Studying the Effect of {Hindi} Tokenizer Performance on Downstream Tasks},
  author    = {Goel, Rashi and Sadat, Fatiha},
  editor    = {Weerasinghe, Ruvan and Anuradha, Isuri and Sumanathilaka, Deshan},
  booktitle = {Proceedings of the First Workshop on Natural Language Processing for Indo-Aryan and Dravidian Languages},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest_wac_2008/2025.indonlp-1.5/},
  pages     = {44--49},
  abstract  = {This paper deals with a study on the effect of training data size and tokenizer performance for Hindi language on the eventual downstream model performance and comprehension. Multiple monolingual Hindi tokenizers are trained for large language models such as BERT and intrinsic and extrinsic evaluations are performed on multiple Hindi datasets. The objective of this study is to understand the precise effects of tokenizer performance on downstream task performance to gain insight on how to develop better models for low-resource languages.},
}
Markdown (Informal):
[Studying the Effect of Hindi Tokenizer Performance on Downstream Tasks](https://preview.aclanthology.org/ingest_wac_2008/2025.indonlp-1.5/) (Goel & Sadat, IndoNLP 2025)
Published by: ACL