@inproceedings{deng-etal-2020-integrating,
title = "Integrating Disfluency-based and Prosodic Features with Acoustics in Automatic Fluency Evaluation of Spontaneous Speech",
author = "Deng, Huaijin and
Lin, Youchao and
Utsuro, Takehito and
Kobayashi, Akio and
Nishizaki, Hiromitsu and
Hoshino, Junichi",
booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.791",
pages = "6429--6437",
abstract = "This paper describes an automatic fluency evaluation of spontaneous speech. In the task of automatic fluency evaluation, we integrate diverse features of acoustics, prosody, and disfluency-based ones. Then, we attempt to reveal the contribution of each of those diverse features to the task of automatic fluency evaluation. Although a variety of different disfluencies are observed regularly in spontaneous speech, we focus on two types of phenomena, i.e., filled pauses and word fragments. The experimental results demonstrate that the disfluency-based features derived from word fragments and filled pauses are effective relative to evaluating fluent/disfluent speech, especially when combined with prosodic features, e.g., such as speech rate and pauses/silence. Next, we employed an LSTM based framework in order to integrate the disfluency-based and prosodic features with time sequential acoustic features. The experimental evaluation results of those integrated diverse features indicate that time sequential acoustic features contribute to improving the model with disfluency-based and prosodic features when detecting fluent speech, but not when detecting disfluent speech. Furthermore, when detecting disfluent speech, the model without time sequential acoustic features performs best even without word fragments features, but only with filled pauses and prosodic features.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="deng-etal-2020-integrating">
<titleInfo>
<title>Integrating Disfluency-based and Prosodic Features with Acoustics in Automatic Fluency Evaluation of Spontaneous Speech</title>
</titleInfo>
<name type="personal">
<namePart type="given">Huaijin</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youchao</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Takehito</namePart>
<namePart type="family">Utsuro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akio</namePart>
<namePart type="family">Kobayashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiromitsu</namePart>
<namePart type="family">Nishizaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Hoshino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>This paper describes an automatic fluency evaluation of spontaneous speech. In the task of automatic fluency evaluation, we integrate diverse features of acoustics, prosody, and disfluency-based ones. Then, we attempt to reveal the contribution of each of those diverse features to the task of automatic fluency evaluation. Although a variety of different disfluencies are observed regularly in spontaneous speech, we focus on two types of phenomena, i.e., filled pauses and word fragments. The experimental results demonstrate that the disfluency-based features derived from word fragments and filled pauses are effective relative to evaluating fluent/disfluent speech, especially when combined with prosodic features, e.g., such as speech rate and pauses/silence. Next, we employed an LSTM based framework in order to integrate the disfluency-based and prosodic features with time sequential acoustic features. The experimental evaluation results of those integrated diverse features indicate that time sequential acoustic features contribute to improving the model with disfluency-based and prosodic features when detecting fluent speech, but not when detecting disfluent speech. Furthermore, when detecting disfluent speech, the model without time sequential acoustic features performs best even without word fragments features, but only with filled pauses and prosodic features.</abstract>
<identifier type="citekey">deng-etal-2020-integrating</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.791</url>
</location>
<part>
<date>2020-may</date>
<extent unit="page">
<start>6429</start>
<end>6437</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Integrating Disfluency-based and Prosodic Features with Acoustics in Automatic Fluency Evaluation of Spontaneous Speech
%A Deng, Huaijin
%A Lin, Youchao
%A Utsuro, Takehito
%A Kobayashi, Akio
%A Nishizaki, Hiromitsu
%A Hoshino, Junichi
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F deng-etal-2020-integrating
%X This paper describes an automatic fluency evaluation of spontaneous speech. In the task of automatic fluency evaluation, we integrate diverse features of acoustics, prosody, and disfluency-based ones. Then, we attempt to reveal the contribution of each of those diverse features to the task of automatic fluency evaluation. Although a variety of different disfluencies are observed regularly in spontaneous speech, we focus on two types of phenomena, i.e., filled pauses and word fragments. The experimental results demonstrate that the disfluency-based features derived from word fragments and filled pauses are effective relative to evaluating fluent/disfluent speech, especially when combined with prosodic features, e.g., such as speech rate and pauses/silence. Next, we employed an LSTM based framework in order to integrate the disfluency-based and prosodic features with time sequential acoustic features. The experimental evaluation results of those integrated diverse features indicate that time sequential acoustic features contribute to improving the model with disfluency-based and prosodic features when detecting fluent speech, but not when detecting disfluent speech. Furthermore, when detecting disfluent speech, the model without time sequential acoustic features performs best even without word fragments features, but only with filled pauses and prosodic features.
%U https://aclanthology.org/2020.lrec-1.791
%P 6429-6437
Markdown (Informal)
[Integrating Disfluency-based and Prosodic Features with Acoustics in Automatic Fluency Evaluation of Spontaneous Speech](https://aclanthology.org/2020.lrec-1.791) (Deng et al., LREC 2020)
ACL