@inproceedings{brierley-etal-2012-open,
title = "Open-Source Boundary-Annotated Corpus for {A}rabic Speech and Language Processing",
author = "Brierley, Claire and
Sawalha, Majdi and
Atwell, Eric",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Do{\u{g}}an, Mehmet U{\u{g}}ur and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}`12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/L12-1092/",
pages = "1011--1016",
abstract = "A boundary-annotated and part-of-speech tagged corpus is a prerequisite for developing phrase break classifiers. Boundary annotations in English speech corpora are descriptive, delimiting intonation units perceived by the listener. We take a novel approach to phrase break prediction for Arabic, deriving our prosodic annotation scheme from Tajw{\={i}}d (recitation) mark-up in the Qur`an which we then interpret as additional text-based data for computational analysis. This mark-up is prescriptive, and signifies a widely-used recitation style, and one of seven original styles of transmission. Here we report on version 1.0 of our Boundary-Annotated Qur`an dataset of 77430 words and 8230 sentences, where each word is tagged with prosodic and syntactic information at two coarse-grained levels. In (Sawalha et al., 2012), we use the dataset in phrase break prediction experiments. This research is part of a larger-scale project to produce annotation schemes, language resources, algorithms, and applications for Classical and Modern Standard Arabic."
}
Markdown (Informal)
[Open-Source Boundary-Annotated Corpus for Arabic Speech and Language Processing](https://preview.aclanthology.org/jlcl-multiple-ingestion/L12-1092/) (Brierley et al., LREC 2012)
ACL