@inproceedings{santhanam-shaikh-2020-understanding,
title = "Understanding the Impact of Experiment Design for Evaluating Dialogue System Output",
author = "Santhanam, Sashank and
Shaikh, Samira",
booktitle = "Proceedings of the Fourth Widening Natural Language Processing Workshop",
month = jul,
year = "2020",
address = "Seattle, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.winlp-1.33",
doi = "10.18653/v1/2020.winlp-1.33",
pages = "124--127",
abstract = "Evaluation of output from natural language generation (NLG) systems is typically conducted via crowdsourced human judgments. To understand the impact of how experiment design might affect the quality and consistency of such human judgments, we designed a between-subjects study with four experimental conditions. Through our systematic study with 40 crowdsourced workers in each task, we find that using continuous scales achieves more consistent ratings than Likert scale or ranking-based experiment design. Additionally, we find that factors such as no prior experience of participating in similar studies of rating dialogue system output",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="santhanam-shaikh-2020-understanding">
<titleInfo>
<title>Understanding the Impact of Experiment Design for Evaluating Dialogue System Output</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sashank</namePart>
<namePart type="family">Santhanam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samira</namePart>
<namePart type="family">Shaikh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-jul</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Widening Natural Language Processing Workshop</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Evaluation of output from natural language generation (NLG) systems is typically conducted via crowdsourced human judgments. To understand the impact of how experiment design might affect the quality and consistency of such human judgments, we designed a between-subjects study with four experimental conditions. Through our systematic study with 40 crowdsourced workers in each task, we find that using continuous scales achieves more consistent ratings than Likert scale or ranking-based experiment design. Additionally, we find that factors such as no prior experience of participating in similar studies of rating dialogue system output</abstract>
<identifier type="citekey">santhanam-shaikh-2020-understanding</identifier>
<identifier type="doi">10.18653/v1/2020.winlp-1.33</identifier>
<location>
<url>https://aclanthology.org/2020.winlp-1.33</url>
</location>
<part>
<date>2020-jul</date>
<extent unit="page">
<start>124</start>
<end>127</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Understanding the Impact of Experiment Design for Evaluating Dialogue System Output
%A Santhanam, Sashank
%A Shaikh, Samira
%S Proceedings of the Fourth Widening Natural Language Processing Workshop
%D 2020
%8 jul
%I Association for Computational Linguistics
%C Seattle, USA
%F santhanam-shaikh-2020-understanding
%X Evaluation of output from natural language generation (NLG) systems is typically conducted via crowdsourced human judgments. To understand the impact of how experiment design might affect the quality and consistency of such human judgments, we designed a between-subjects study with four experimental conditions. Through our systematic study with 40 crowdsourced workers in each task, we find that using continuous scales achieves more consistent ratings than Likert scale or ranking-based experiment design. Additionally, we find that factors such as no prior experience of participating in similar studies of rating dialogue system output
%R 10.18653/v1/2020.winlp-1.33
%U https://aclanthology.org/2020.winlp-1.33
%U https://doi.org/10.18653/v1/2020.winlp-1.33
%P 124-127
Markdown (Informal)
[Understanding the Impact of Experiment Design for Evaluating Dialogue System Output](https://aclanthology.org/2020.winlp-1.33) (Santhanam & Shaikh, WiNLP 2020)
ACL