% Conference paper exported from the ACL Anthology LREC 2026 ingest (see url).
% Review notes (kept outside the entry; BibTeX ignores text between entries):
%   - Typed as inproceedings with booktitle instead of article with journal:
%     the venue is a conference, and standard styles ignore the editor field
%     on article entries.
%   - The exported volume value "main" is kept in the ignored field
%     anthology-volume so that styles do not print "volume main". Presumably
%     it is the Anthology volume id (it matches "lrec-main" in the url);
%     TODO confirm.
%   - NOTE(review): booktitle is the venue name exactly as exported; the full
%     proceedings title is not visible here, confirm before publication.
%   - NOTE(review): url points at a preview/ingest host; confirm the permanent
%     address once the volume is published.
%   - Title: whole words are braced to survive sentence-casing styles, and the
%     m-with-dot-above is written as a LaTeX accent so the title is pure ASCII.
%   - Abstract: left as exported (including raw UTF-8 letters) except for the
%     removal of a stray "d" in "future d research".
@inproceedings{tsukagoshi-ohmukai-2026-aligned,
  author           = {Tsukagoshi, Yuzuki and
                      Ohmukai, Ikki},
  title            = {Aligned Parallel Corpus of the {Vedic} {Sa{\.m}hit{\={a}}s} for Machine Translation},
  editor           = {Piperidis, Stelios and
                      Bel, N{\'u}ria and
                      van den Heuvel, Henk and
                      Ide, Nancy and
                      Krek, Simon and
                      Toral, Antonio},
  booktitle        = {International Conference on Language Resources and Evaluation},
  anthology-volume = {main},
  pages            = {3434--3444},
  month            = may,
  year             = {2026},
  address          = {Palma de Mallorca, Spain},
  publisher        = {ELRA Language Resource Association},
  url              = {https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.272/},
  abstract         = {We introduce a verse-/paragraph-aligned parallel corpus for three Vedic Saṁhit{\={a}}s {--}the R̥gveda (R̥V), the Atharvaveda {\'S}aunaka (AV{\'S}), and the Taittir{\={i}}ya Saṁhit{\={a}} (TS){--} paired with authoritative public-domain translations (Geldner for R̥V, Whitney for AV{\'S}, and Keith for TS). The source texts are drawn from established digital editions (e.g., TITUS and VedaWeb) and normalized under ISO 15919. Each Sanskrit segment is aligned to exactly one translated unit (verse or paragraph for TS prose), yielding a unified, model-ready format. Using this resource, we fine-tune and evaluate three large language models {--}GPT-4.1 nano, Gemini 2.5 Flash, and Mitra{--} on Vedic$\to$German/English translation. Evaluation combines surface and semantic metrics (case-insensitive sacreBLEU and COMET), enabling a balanced assessment of form and meaning. Results show consistent in-domain gains after supervised fine-tuning, but substantial cross-domain degradation when models are tested on unseen Saṁhit{\={a}}s, indicating pronounced stylistic and lexical divergence among R̥V, AV{\'S}, and TS. These findings motivate domain-aware training and reporting practices for Vedic machine translation. We release the corpus with standardized splits and preprocessing to support reproducibility and future research on historical language modeling, alignment, and translation for low-resource ancient languages.},
}
Markdown (Informal)
[Aligned Parallel Corpus of the Vedic Saṁhitās for Machine Translation](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.272/) (Tsukagoshi & Ohmukai, LREC 2026)
ACL