@inproceedings{rikters-etal-2020-document,
title = "Document-aligned {J}apanese-{E}nglish Conversation Parallel Corpus",
author = "Rikters, Mat{\=\i}ss and
Ri, Ryokan and
Li, Tong and
Nakazawa, Toshiaki",
booktitle = "Proceedings of the Fifth Conference on Machine Translation",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.wmt-1.74",
pages = "639--645",
abstract = "Sentence-level (SL) machine translation (MT) has reached acceptable quality for many high-resourced languages, but not document-level (DL) MT, which is difficult to 1) train with little amount of DL data; and 2) evaluate, as the main methods and data sets focus on SL evaluation. To address the first issue, we present a document-aligned Japanese-English conversation corpus, including balanced, high-quality business conversation data for tuning and testing. As for the second issue, we manually identify the main areas where SL MT fails to produce adequate translations in lack of context. We then create an evaluation set where these phenomena are annotated to alleviate automatic evaluation of DL systems. We train MT models using our corpus to demonstrate how using context leads to improvements.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rikters-etal-2020-document">
<titleInfo>
<title>Document-aligned Japanese-English Conversation Parallel Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Matīss</namePart>
<namePart type="family">Rikters</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryokan</namePart>
<namePart type="family">Ri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Toshiaki</namePart>
<namePart type="family">Nakazawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-nov</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Conference on Machine Translation</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Sentence-level (SL) machine translation (MT) has reached acceptable quality for many high-resourced languages, but not document-level (DL) MT, which is difficult to 1) train with little amount of DL data; and 2) evaluate, as the main methods and data sets focus on SL evaluation. To address the first issue, we present a document-aligned Japanese-English conversation corpus, including balanced, high-quality business conversation data for tuning and testing. As for the second issue, we manually identify the main areas where SL MT fails to produce adequate translations in lack of context. We then create an evaluation set where these phenomena are annotated to alleviate automatic evaluation of DL systems. We train MT models using our corpus to demonstrate how using context leads to improvements.</abstract>
<identifier type="citekey">rikters-etal-2020-document</identifier>
<location>
<url>https://aclanthology.org/2020.wmt-1.74</url>
</location>
<part>
<date>2020-nov</date>
<extent unit="page">
<start>639</start>
<end>645</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Document-aligned Japanese-English Conversation Parallel Corpus
%A Rikters, Matīss
%A Ri, Ryokan
%A Li, Tong
%A Nakazawa, Toshiaki
%S Proceedings of the Fifth Conference on Machine Translation
%D 2020
%8 nov
%I Association for Computational Linguistics
%C Online
%F rikters-etal-2020-document
%X Sentence-level (SL) machine translation (MT) has reached acceptable quality for many high-resourced languages, but not document-level (DL) MT, which is difficult to 1) train with little amount of DL data; and 2) evaluate, as the main methods and data sets focus on SL evaluation. To address the first issue, we present a document-aligned Japanese-English conversation corpus, including balanced, high-quality business conversation data for tuning and testing. As for the second issue, we manually identify the main areas where SL MT fails to produce adequate translations in lack of context. We then create an evaluation set where these phenomena are annotated to alleviate automatic evaluation of DL systems. We train MT models using our corpus to demonstrate how using context leads to improvements.
%U https://aclanthology.org/2020.wmt-1.74
%P 639-645
Markdown (Informal)
[Document-aligned Japanese-English Conversation Parallel Corpus](https://aclanthology.org/2020.wmt-1.74) (Rikters et al., WMT 2020)
ACL