@article{hariharan-mortensen-2026-transformer,
title = "Transformer-Enabled Diachronic Analysis of {V}edic {S}anskrit: Neural Methods for Quantifying Types of Language Change",
author = "Hariharan, Ananth A. and
Mortensen, David R.",
editor = "Piperidis, Stelios and
Bel, N{\'u}ria and
van den Heuvel, Henk and
Ide, Nancy and
Krek, Simon and
Toral, Antonio",
journal = "International Conference on Language Resources and Evaluation",
volume = "main",
month = may,
year = "2026",
address = "Palma de Mallorca, Spain",
publisher = "ELRA Language Resource Association",
url = "https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.81/",
pages = "1044--1053",
abstract = "This study demonstrates how hybrid neural-symbolic methods can yield significant new insights into the evolution of a morphologically rich, low-resource language. We challenge the naive assumption that linguistic change is simplification by quantitatively analyzing over 2,000 years of Sanskrit, demonstrating how weakly-supervised hybrid methods can yield new insights into the evolution of morphologically rich, low-resource languages. Our approach addresses data scarcity through weak supervision, using 100+ high-precision regex patterns to generate pseudo-labels for fine-tuning a multilingual BERT. We then fuse symbolic and neural outputs via a novel confidence-weighted ensemble, creating a system that is both scalable and interpretable. Applying this framework to a 1.47-million-word diachronic corpus, our ensemble achieves a 52.4{\%} overall feature detection rate. Our findings reveal that Sanskrit{'}s overall morphological complexity does not decrease but is instead dynamically redistributed: while earlier verbal features show cyclical patterns of decline, complexity shifts to other domains, evidenced by a dramatic expansion in compounding and the emergence of new philosophical terminology. Critically, our system produces well-calibrated uncertainty estimates, with confidence strongly correlating with accuracy (Pearson r = 0.92) and low overall calibration error (ECE = 0.043), bolstering the reliability of these findings for computational philology."
}Markdown (Informal)
[Transformer-Enabled Diachronic Analysis of Vedic Sanskrit: Neural Methods for Quantifying Types of Language Change](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.81/) (Hariharan & Mortensen, LREC 2026)
ACL