@inproceedings{behera-saluja-2025-hilearners,
title = "{H}i{L}earners: Non-Native Spoken {H}indi Error Correction",
author = "Behera, Sourava Kumar and
Saluja, Rohit",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.78/",
pages = "1276--1288",
ISBN = "979-8-89176-303-6",
abstract = "While majority of current resources rely on formal text corrections, our work shifts the focus to non-native spoken Hindi error correction, which presents unique challenges due to its rich morphology, complex syntax, and distinct error patterns. To address the scarcity of authentic learner data, we introduce HiLearners, a dataset gathered from 2,500 real non-native Hindi speakers across three linguistic backgrounds (English, Bengali, Dravidian), capturing authentic error patterns including transfer errors, overgeneralization patterns, and contextual agreement issues. Furthermore, to overcome data resource limitations, we develop a methodical synthetic data augmentation technique, utilizing Large Language Models (LLMs) with a pattern analysis and controlled generation approach similar to Retrieval-Augmented Generation (RAG), yielding 5,500 carefully verified synthetic examples. Through extensive experiments on individual, mixed, and progressive curriculum-based configurations using multilingual models, we demonstrate that LLM-based synthetic data combined with three-phase curriculum learning significantly boosts performance, achieving a 76.92 GLEU score and surpassing human-only baselines. This work bridges the gap between native-centric error correction research and non-native Hindi learner needs, establishing a realistic assessment standard for advancing low-resource language processing."
}