@inproceedings{bhattacharyya-bhattacharya-2025-banglabyt5,
title = "{B}angla{B}y{T}5: Byte-Level Modelling for {B}angla",
author = "Bhattacharyya, Pramit and
Bhattacharya, Arnab",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.297/",
doi = "10.18653/v1/2025.findings-emnlp.297",
pages = "5551--5560",
ISBN = "979-8-89176-335-7",
abstract = "Large language models (LLMs) have achievedremarkable success across various natural lan-guage processing tasks. However, most LLMmodels use traditional tokenizers like BPE andSentencePiece, which fail to capture the finernuances of a morphologically rich languagelike Bangla (Bengali). In this work, we introduce BanglaByT5, the first byte-level encoder-decoder model explicitly tailored for Bangla.Built upon a small variant of Google{'}s ByT5architecture, BanglaByT5 is pre-trained on a14GB curated corpus combining high-qualityliterary and newspaper articles. Through zero-shot and supervised evaluations across gen-erative and classification tasks, BanglaByT5demonstrates competitive performance, surpassing several multilingual and larger models.Our findings highlight BanglaByT5{'}s potentialas a lightweight yet powerful tool for BanglaNLP, particularly in resource-constrained orscalable environments. BanglaByT5 is pub-licly available for download from https://huggingface.co/Vacaspati/BanglaByT5."
}
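The abstract points to a public checkpoint at https://huggingface.co/Vacaspati/BanglaByT5. Below is a minimal loading sketch, assuming the checkpoint follows the standard ByT5/T5 layout on the Hugging Face Hub; the specific classes and the example input are assumptions for illustration, not taken from the entry.

```python
# Minimal sketch, assuming a standard ByT5-style checkpoint at the URL
# given in the abstract (Vacaspati/BanglaByT5).
from transformers import AutoTokenizer, T5ForConditionalGeneration

model_id = "Vacaspati/BanglaByT5"  # repository named in the abstract
tokenizer = AutoTokenizer.from_pretrained(model_id)          # byte-level tokenizer for ByT5-family models
model = T5ForConditionalGeneration.from_pretrained(model_id)

# Illustrative generation on raw Bangla text (byte-level input).
inputs = tokenizer("বাংলা", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```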