@comment{
  Workshop paper record in the layout of an ACL Anthology BibTeX export.
  NOTE(review): the url field below points at a preview.aclanthology.org
  ingest path; confirm whether it should be replaced by the canonical
  Anthology URL once the volume is published.
  NOTE(review): address holds the event location (Anthology convention),
  not the publisher's city; confirm this matches the target style.
}
@inproceedings{tsukagoshi-ohmukai-2025-automatic,
    title     = {Automatic Accent Restoration in {Vedic} {Sanskrit} with Neural Language Models},
    author    = {Tsukagoshi, Yuzuki and
                 Ohmukai, Ikki},
    editor    = {Bhattacharya, Arnab and
                 Goyal, Pawan and
                 Ghosh, Saptarshi and
                 Ghosh, Kripabandhu},
    booktitle = {Proceedings of the 1st Workshop on Benchmarks, Harmonization, Annotation, and Standardization for Human-Centric {AI} in {Indian} Languages ({BHASHA} 2025)},
    month     = dec,
    year      = {2025},
    address   = {Mumbai, India},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.bhasha-1.7/},
    pages     = {83--90},
    isbn      = {979-8-89176-313-5},
    abstract  = {Vedic Sanskrit, the oldest attested form of Sanskrit, employs a distinctive pitch-accent system that marks one syllable per word. This work presents the first application of large language models to the automatic restoration of accent marks in transliterated Vedic Sanskrit texts. A comprehensive corpus was assembled by extracting major Vedic works from the TITUS project and constructing paired samples of unaccented input and correctly accented references, yielding more than 100,000 training examples. Three generative LLMs were fine-tuned on this corpus: a LoRA-adapted Llama 3.1 8B Instruct model, OpenAI GPT-4.1 nano, and Google Gemini 2.5 Flash. These models were trained in a sequence-to-sequence fashion to insert accent marks at appropriate positions. Evaluation on roughly 2,000 sentences using precision, recall, F1, character error rate, word error rate, and ChrF1 metrics shows that fine-tuned models substantially outperform their untuned baselines. The LoRA-tuned Llama achieves the highest F1, followed by Gemini 2.5 Flash and GPT-4.1 nano. Error analysis reveals that the models learn to infer accents from grammatical and phonological context. These results demonstrate that LLMs can capture complex accentual patterns and recover lost information, opening possibilities for improved sandhi splitting, morphological analysis, syntactic parsing and machine translation in Vedic NLP pipelines.},
}
Markdown (Informal)
[Automatic Accent Restoration in Vedic Sanskrit with Neural Language Models](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.bhasha-1.7/) (Tsukagoshi & Ohmukai, BHASHA 2025)
ACL