% Conference paper in the BUCC 2021 workshop proceedings.
% Case-sensitive words in title/booktitle are braced as whole words so that
% styles which sentence-case titles keep their capitalisation.
@inproceedings{wu-yarowsky-2021-pronunciations,
  title     = {On Pronunciations in {Wiktionary}: Extraction and Experiments on Multilingual Syllabification and Stress Prediction},
  author    = {Wu, Winston and Yarowsky, David},
  booktitle = {Proceedings of the 14th Workshop on Building and Using Comparable Corpora ({BUCC} 2021)},
  month     = sep,
  year      = {2021},
  address   = {Online (Virtual Mode)},
  publisher = {INCOMA Ltd.},
  url       = {https://aclanthology.org/2021.bucc-1.9},
  pages     = {68--74},
  abstract  = {We constructed parsers for five non-English editions of Wiktionary, which combined with pronunciations from the English edition, comprises over 5.3 million IPA pronunciations, the largest pronunciation lexicon of its kind. This dataset is a unique comparable corpus of IPA pronunciations annotated from multiple sources. We analyze the dataset, noting the presence of machine-generated pronunciations. We develop a novel visualization method to quantify syllabification. We experiment on the new combined task of multilingual IPA syllabification and stress prediction, finding that training a massively multilingual neural sequence-to-sequence model with copy attention can improve performance on both high- and low-resource languages, and multi-task training on stress prediction helps with syllabification.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding one record, citekey wu-yarowsky-2021-pronunciations. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="wu-yarowsky-2021-pronunciations">
    <!-- Title of the paper itself -->
    <titleInfo>
      <title>On Pronunciations in Wiktionary: Extraction and Experiments on Multilingual Syllabification and Stress Prediction</title>
    </titleInfo>
    <!-- Authors, in order -->
    <name type="personal">
      <namePart type="given">Winston</namePart>
      <namePart type="family">Wu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="family">Yarowsky</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- NOTE(review): date is free text (no encoding attribute), not an ISO/W3CDTF value - confirm consumers accept it -->
    <originInfo>
      <dateIssued>2021-sep</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the proceedings volume containing the paper -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 14th Workshop on Building and Using Comparable Corpora (BUCC 2021)</title>
      </titleInfo>
      <originInfo>
        <publisher>INCOMA Ltd.</publisher>
        <place>
          <placeTerm type="text">Online (Virtual Mode)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We constructed parsers for five non-English editions of Wiktionary, which combined with pronunciations from the English edition, comprises over 5.3 million IPA pronunciations, the largest pronunciation lexicon of its kind. This dataset is a unique comparable corpus of IPA pronunciations annotated from multiple sources. We analyze the dataset, noting the presence of machine-generated pronunciations. We develop a novel visualization method to quantify syllabification. We experiment on the new combined task of multilingual IPA syllabification and stress prediction, finding that training a massively multilingual neural sequence-to-sequence model with copy attention can improve performance on both high- and low-resource languages, and multi-task training on stress prediction helps with syllabification.</abstract>
    <identifier type="citekey">wu-yarowsky-2021-pronunciations</identifier>
    <location>
      <url>https://aclanthology.org/2021.bucc-1.9</url>
    </location>
    <!-- Page range of the paper -->
    <part>
      <date>2021-sep</date>
      <extent unit="page">
        <start>68</start>
        <end>74</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T On Pronunciations in Wiktionary: Extraction and Experiments on Multilingual Syllabification and Stress Prediction
%A Wu, Winston
%A Yarowsky, David
%S Proceedings of the 14th Workshop on Building and Using Comparable Corpora (BUCC 2021)
%D 2021
%8 sep
%I INCOMA Ltd.
%C Online (Virtual Mode)
%F wu-yarowsky-2021-pronunciations
%X We constructed parsers for five non-English editions of Wiktionary, which combined with pronunciations from the English edition, comprises over 5.3 million IPA pronunciations, the largest pronunciation lexicon of its kind. This dataset is a unique comparable corpus of IPA pronunciations annotated from multiple sources. We analyze the dataset, noting the presence of machine-generated pronunciations. We develop a novel visualization method to quantify syllabification. We experiment on the new combined task of multilingual IPA syllabification and stress prediction, finding that training a massively multilingual neural sequence-to-sequence model with copy attention can improve performance on both high- and low-resource languages, and multi-task training on stress prediction helps with syllabification.
%U https://aclanthology.org/2021.bucc-1.9
%P 68-74
Markdown (Informal)
[On Pronunciations in Wiktionary: Extraction and Experiments on Multilingual Syllabification and Stress Prediction](https://aclanthology.org/2021.bucc-1.9) (Wu & Yarowsky, BUCC 2021)
ACL