@inproceedings{p-mahalingam-2025-gemma,
  title     = {The Gemma Sutras: Fine-Tuning {Gemma} 3 for {Sanskrit} Sandhi Splitting},
  author    = {P, Samarth and
               Mahalingam, Sanjay Balaji},
  editor    = {Zhang, Chen and
               Allaway, Emily and
               Shen, Hua and
               Miculicich, Lesly and
               Li, Yinqiao and
               M'hamdi, Meryem and
               Limkonchotiwat, Peerat and
               Bai, Richard He and
               T.y.s.s., Santosh and
               Han, Sophia Simeng and
               Thapa, Surendrabikram and
               Rim, Wiem Ben},
  booktitle = {Proceedings of the 9th Widening NLP Workshop},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.winlp-main.35/},
  doi       = {10.18653/v1/2025.winlp-main.35},
  pages     = {235--241},
  isbn      = {979-8-89176-351-7},
  abstract  = {Sandhi, the phonological merging of morphemes, is a central feature of Sanskrit grammar. While Sandhi formation is well-defined by P{\={a}}{\d{n}}ini{'}s A{\d{s}}{\d{t}}{\={a}}dhy{\={a}}y{\={\i}}, the reverse task{---}Sandhi splitting{---}is substantially more complex due to inherent ambiguity and context-sensitive transformations. Accurate splitting is a critical precursor to tokenization in Sanskrit, which lacks explicit word boundaries and presents densely fused compounds. In this work, we present a data-driven approach, fine-tuning the Gemma-3 4B large language model on a dataset of over 49,000 training and 2,000 test examples of compound words and their morpheme-level decompositions. Leveraging the Unsloth framework with low-rank adaptation (LoRA) and 4-bit quantization, we train the model to predict these splits. Our work yields a scalable, Sandhi-aware system designed to enhance modern NLP pipelines for classical Sanskrit, demonstrating an effective application of LLMs to this linguistic challenge.}
}
Markdown (Informal)
[The Gemma Sutras: Fine-Tuning Gemma 3 for Sanskrit Sandhi Splitting](https://aclanthology.org/2025.winlp-main.35/) (P & Mahalingam, WiNLP 2025)
ACL