@inproceedings{cho-etal-2020-discourse,
title = "Discourse Component to Sentence ({DC}2{S}): An Efficient Human-Aided Construction of Paraphrase and Sentence Similarity Dataset",
author = "Cho, Won Ik and
Kim, Jong In and
Moon, Young Ki and
Kim, Nam Soo",
booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.842",
pages = "6819--6826",
abstract = "Assessing the similarity of sentences and detecting paraphrases is an essential task both in theory and practice, but achieving a reliable dataset requires high resource. In this paper, we propose a discourse component-based paraphrase generation for the directive utterances, which is efficient in terms of human-aided construction and content preservation. All discourse components are expressed in natural language phrases, and the phrases are created considering both speech act and topic so that the controlled construction of the sentence similarity dataset is available. Here, we investigate the validity of our scheme using the Korean language, a language with diverse paraphrasing due to frequent subject drop and scramblings. With 1,000 intent argument phrases and thus generated 10,000 utterances, we make up a sentence similarity dataset of practically sufficient size. It contains five sentence pair types, including paraphrase, and displays a total volume of about 550K. To emphasize the utility of the scheme and dataset, we measure the similarity matching performance via conventional natural language inference models, also suggesting the multi-lingual extensibility.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cho-etal-2020-discourse">
<titleInfo>
<title>Discourse Component to Sentence (DC2S): An Efficient Human-Aided Construction of Paraphrase and Sentence Similarity Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Won</namePart>
<namePart type="given">Ik</namePart>
<namePart type="family">Cho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jong</namePart>
<namePart type="given">In</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Young</namePart>
<namePart type="given">Ki</namePart>
<namePart type="family">Moon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nam</namePart>
<namePart type="given">Soo</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>Assessing the similarity of sentences and detecting paraphrases is an essential task both in theory and practice, but achieving a reliable dataset requires high resource. In this paper, we propose a discourse component-based paraphrase generation for the directive utterances, which is efficient in terms of human-aided construction and content preservation. All discourse components are expressed in natural language phrases, and the phrases are created considering both speech act and topic so that the controlled construction of the sentence similarity dataset is available. Here, we investigate the validity of our scheme using the Korean language, a language with diverse paraphrasing due to frequent subject drop and scramblings. With 1,000 intent argument phrases and thus generated 10,000 utterances, we make up a sentence similarity dataset of practically sufficient size. It contains five sentence pair types, including paraphrase, and displays a total volume of about 550K. To emphasize the utility of the scheme and dataset, we measure the similarity matching performance via conventional natural language inference models, also suggesting the multi-lingual extensibility.</abstract>
<identifier type="citekey">cho-etal-2020-discourse</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.842</url>
</location>
<part>
<date>2020-may</date>
<extent unit="page">
<start>6819</start>
<end>6826</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Discourse Component to Sentence (DC2S): An Efficient Human-Aided Construction of Paraphrase and Sentence Similarity Dataset
%A Cho, Won Ik
%A Kim, Jong In
%A Moon, Young Ki
%A Kim, Nam Soo
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F cho-etal-2020-discourse
%X Assessing the similarity of sentences and detecting paraphrases is an essential task both in theory and practice, but achieving a reliable dataset requires high resource. In this paper, we propose a discourse component-based paraphrase generation for the directive utterances, which is efficient in terms of human-aided construction and content preservation. All discourse components are expressed in natural language phrases, and the phrases are created considering both speech act and topic so that the controlled construction of the sentence similarity dataset is available. Here, we investigate the validity of our scheme using the Korean language, a language with diverse paraphrasing due to frequent subject drop and scramblings. With 1,000 intent argument phrases and thus generated 10,000 utterances, we make up a sentence similarity dataset of practically sufficient size. It contains five sentence pair types, including paraphrase, and displays a total volume of about 550K. To emphasize the utility of the scheme and dataset, we measure the similarity matching performance via conventional natural language inference models, also suggesting the multi-lingual extensibility.
%U https://aclanthology.org/2020.lrec-1.842
%P 6819-6826
Markdown (Informal)
[Discourse Component to Sentence (DC2S): An Efficient Human-Aided Construction of Paraphrase and Sentence Similarity Dataset](https://aclanthology.org/2020.lrec-1.842) (Cho et al., LREC 2020)
ACL