% Workshop paper, ALP 2025 (ACL Anthology ID 2025.alp-1.5).
@inproceedings{de-luca-2025-accessible,
  title     = {Accessible {Sanskrit}: A Cascading System for Text Analysis and Dictionary Access},
  author    = {De Luca, Giacomo},
  editor    = {Anderson, Adam and
               Gordin, Shai and
               Li, Bin and
               Liu, Yudong and
               Passarotti, Marco C. and
               Sprugnoli, Rachele},
  booktitle = {Proceedings of the Second Workshop on Ancient Language Processing},
  month     = may,
  year      = {2025},
  address   = {Albuquerque, New Mexico, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.alp-1.5/},
  pages     = {38--46},
  isbn      = {979-8-89176-235-0},
  abstract  = {Sanskrit text processing presents unique computational challenges due to its complex morphology, frequent compound formation, and the phenomenon of Sandhi. While several approaches to Sanskrit word segmentation exist, the field lacks integrated tools that make texts accessible while maintaining high accuracy. We present a hybrid approach combining rule-based and statistical methods that achieves reliable Sanskrit text analysis through a cascade mechanism in which a deterministic matching using inflection tables is used for simple cases and statistical approaches are used for the more complex ones. The goal of the system is to provide automatic text annotation and inflected dictionary search, returning for each word root forms, comprehensive grammatical analysis, inflection tables, and dictionary entries from multiple sources. The system is evaluated on 300 randomly selected compounds from the GRETIL corpus across different length categories and maintains 90{\%} accuracy regardless of compound length, with 91{\%} accuracy on the 40+ characters long compounds. The approach is also tested on the complete text of the Yoga Sutra, demonstrating 96{\%} accuracy in the practical use case. This approach is implemented both as an open-source Python library and a web application, making Sanskrit text analysis accessible to scholars and interested readers while retaining state-of-the-art accuracy.},
}
Markdown (Informal)
[Accessible Sanskrit: A Cascading System for Text Analysis and Dictionary Access](https://aclanthology.org/2025.alp-1.5/) (De Luca, ALP 2025)
ACL