@inproceedings{ihori-etal-2020-parallel,
title = "Parallel Corpus for {J}apanese Spoken-to-Written Style Conversion",
author = "Ihori, Mana and
Takashima, Akihiko and
Masumura, Ryo",
booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.779",
pages = "6346--6353",
abstract = "With the increase of automatic speech recognition (ASR) applications, spoken-to-written style conversion that transforms spoken-style text into written-style text is becoming an important technology to increase the readability of ASR transcriptions. To establish such conversion technology, a parallel corpus of spoken-style text and written-style text is beneficial because it can be utilized for building end-to-end neural sequence transformation models. Spoken-to-written style conversion involves multiple conversion problems including punctuation restoration, disfluency detection, and simplification. However, most existing corpora tend to be made for just one of these conversion problems. In addition, in Japanese, we have to consider not only general spoken-to-written style conversion problems but also Japanese-specific ones, such as language style unification (e.g., polite, frank, and direct styles) and omitted postpositional particle expressions restoration. Therefore, we created a new Japanese parallel corpus of spoken-style text and written-style text that can simultaneously handle general problems and Japanese-specific ones. To make this corpus, we prepared four types of spoken-style text and utilized a crowdsourcing service for manually converting them into written-style text. This paper describes the building setup of this corpus and reports the baseline results of spoken-to-written style conversion using the latest neural sequence transformation models.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ihori-etal-2020-parallel">
<titleInfo>
<title>Parallel Corpus for Japanese Spoken-to-Written Style Conversion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mana</namePart>
<namePart type="family">Ihori</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akihiko</namePart>
<namePart type="family">Takashima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryo</namePart>
<namePart type="family">Masumura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>With the increase of automatic speech recognition (ASR) applications, spoken-to-written style conversion that transforms spoken-style text into written-style text is becoming an important technology to increase the readability of ASR transcriptions. To establish such conversion technology, a parallel corpus of spoken-style text and written-style text is beneficial because it can be utilized for building end-to-end neural sequence transformation models. Spoken-to-written style conversion involves multiple conversion problems including punctuation restoration, disfluency detection, and simplification. However, most existing corpora tend to be made for just one of these conversion problems. In addition, in Japanese, we have to consider not only general spoken-to-written style conversion problems but also Japanese-specific ones, such as language style unification (e.g., polite, frank, and direct styles) and omitted postpositional particle expressions restoration. Therefore, we created a new Japanese parallel corpus of spoken-style text and written-style text that can simultaneously handle general problems and Japanese-specific ones. To make this corpus, we prepared four types of spoken-style text and utilized a crowdsourcing service for manually converting them into written-style text. This paper describes the building setup of this corpus and reports the baseline results of spoken-to-written style conversion using the latest neural sequence transformation models.</abstract>
<identifier type="citekey">ihori-etal-2020-parallel</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.779</url>
</location>
<part>
<date>2020-may</date>
<extent unit="page">
<start>6346</start>
<end>6353</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Parallel Corpus for Japanese Spoken-to-Written Style Conversion
%A Ihori, Mana
%A Takashima, Akihiko
%A Masumura, Ryo
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F ihori-etal-2020-parallel
%X With the increase of automatic speech recognition (ASR) applications, spoken-to-written style conversion that transforms spoken-style text into written-style text is becoming an important technology to increase the readability of ASR transcriptions. To establish such conversion technology, a parallel corpus of spoken-style text and written-style text is beneficial because it can be utilized for building end-to-end neural sequence transformation models. Spoken-to-written style conversion involves multiple conversion problems including punctuation restoration, disfluency detection, and simplification. However, most existing corpora tend to be made for just one of these conversion problems. In addition, in Japanese, we have to consider not only general spoken-to-written style conversion problems but also Japanese-specific ones, such as language style unification (e.g., polite, frank, and direct styles) and omitted postpositional particle expressions restoration. Therefore, we created a new Japanese parallel corpus of spoken-style text and written-style text that can simultaneously handle general problems and Japanese-specific ones. To make this corpus, we prepared four types of spoken-style text and utilized a crowdsourcing service for manually converting them into written-style text. This paper describes the building setup of this corpus and reports the baseline results of spoken-to-written style conversion using the latest neural sequence transformation models.
%U https://aclanthology.org/2020.lrec-1.779
%P 6346-6353
Markdown (Informal)
[Parallel Corpus for Japanese Spoken-to-Written Style Conversion](https://aclanthology.org/2020.lrec-1.779) (Ihori et al., LREC 2020)
ACL