% Workshop paper in the BHASHA 2025 proceedings (Anthology ID 2025.bhasha-1.15).
% The url field uses the canonical Anthology address rather than the temporary
% preview/ingest branch so that the link stays valid after publication.
@inproceedings{yadav-shrivastava-2025-a3,
  title     = {A3-108 at {BHASHA} Task1: Asymmetric {BPE} configuration for Grammar Error Correction},
  author    = {Yadav, Saumitra and
               Shrivastava, Manish},
  editor    = {Bhattacharya, Arnab and
               Goyal, Pawan and
               Ghosh, Saptarshi and
               Ghosh, Kripabandhu},
  booktitle = {Proceedings of the 1st Workshop on Benchmarks, Harmonization, Annotation, and Standardization for Human-Centric AI in Indian Languages (BHASHA 2025)},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.bhasha-1.15/},
  pages     = {147--154},
  isbn      = {979-8-89176-313-5},
  abstract  = {This paper presents our approach to Grammatical Error Correction (GEC) for five low-resource Indic languages, a task severely limited by a scarcity of annotated data. Our core methodology involves two stages: synthetic data generation and model optimization. First, we leverage the provided training data to build a Statistical Machine Translation (SMT) system, which is then used to generate large-scale synthetic noisy-to-clean parallel data from available monolingual text. This artificially corrupted data significantly enhances model robustness. Second, we train Transformer-based sequence-to-sequence models using an asymmetric and symmetric Byte Pair Encoding (BPE) configuration, where the number of merge operations differs between the source (erroneous) and target (corrected) sides to better capture language-specific characteristics. For instance, source BPE sizes 4000, 8000 and 16000, with target sizes at 500, 1000, 2000, 3000 and 4000. Our experiments demonstrated competitive performance across all five languages, with the best results achieving a GLEU score of 94.16 for Malayalam (Rank 4th) followed by Bangla at 92.44 (ranked 5th), Tamil at 85.52 (ranked 5th), Telugu at 81.9 (7th), and Hindi at 79.45 (10th) in the shared task. These findings substantiate the effectiveness of combining SMT-based synthetic data generation with asymmetric BPE configurations for low-resource GEC.},
}
Markdown (Informal)
[A3-108 at BHASHA Task1: Asymmetric BPE configuration for Grammar Error Correction](https://aclanthology.org/2025.bhasha-1.15/) (Yadav & Shrivastava, BHASHA 2025)
ACL