@inproceedings{sellmer-hellwig-2024-vedic,
title = "The {V}edic Compound Dataset",
author = "Sellmer, Sven and
Hellwig, Oliver",
editor = {Bhatia, Archna and
Bouma, Gosse and
Do{\u{g}}ru{\"o}z, A. Seza and
Evang, Kilian and
Garcia, Marcos and
Giouli, Voula and
Han, Lifeng and
Nivre, Joakim and
Rademaker, Alexandre},
booktitle = "Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.mwe-1.8/",
pages = "50--55",
abstract = "This paper introduces the Vedic Compound Dataset (VCD), the first resource providing annotated compounds from Vedic Sanskrit, a South Asian Indo-European language used from ca. 1500 to 500 BCE. The VCD aims at facilitating the study of language change in early Indo-Iranian and offers comparative material for quantitative cross-linguistic research on compounds. The process of annotating Vedic compounds is complex as they contain five of the six basic types of compounds defined by Scalise {\&} Bisetto (2005), which are, however, not consistently marked in morphosyntax, making their automatic classification a significant challenge. The paper details the process of collecting and preprocessing the relevant data, with a particular focus on the question of how to distinguish exocentric from endocentric usage. It further discusses experiments with a simple ML classifier that uses compound internal syntactic relations, outlines the composition of the dataset, and sketches directions for future research."
}
Markdown (Informal)
[The Vedic Compound Dataset](https://preview.aclanthology.org/add-emnlp-2024-awards/2024.mwe-1.8/) (Sellmer & Hellwig, MWE-UDW 2024)
ACL
- Sven Sellmer and Oliver Hellwig. 2024. The Vedic Compound Dataset. In Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024, pages 50–55, Torino, Italia. ELRA and ICCL.