@inproceedings{rikters-etal-2020-document,
title = "Document-aligned {J}apanese-{E}nglish Conversation Parallel Corpus",
author = "Rikters, Mat{\={i}}ss and
Ri, Ryokan and
Li, Tong and
Nakazawa, Toshiaki",
editor = {Barrault, Lo{\"i}c and
Bojar, Ond{\v{r}}ej and
Bougares, Fethi and
Chatterjee, Rajen and
Costa-juss{\`a}, Marta R. and
Federmann, Christian and
Fishel, Mark and
Fraser, Alexander and
Graham, Yvette and
Guzman, Paco and
Haddow, Barry and
Huck, Matthias and
Yepes, Antonio Jimeno and
Koehn, Philipp and
Martins, Andr{\'e} and
Morishita, Makoto and
Monz, Christof and
Nagata, Masaaki and
Nakazawa, Toshiaki and
Negri, Matteo},
booktitle = "Proceedings of the Fifth Conference on Machine Translation",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/moar-dois/2020.wmt-1.74/",
pages = "639--645",
abstract = "Sentence-level (SL) machine translation (MT) has reached acceptable quality for many high-resourced languages, but not document-level (DL) MT, which is difficult to 1) train with little amount of DL data; and 2) evaluate, as the main methods and data sets focus on SL evaluation. To address the first issue, we present a document-aligned Japanese-English conversation corpus, including balanced, high-quality business conversation data for tuning and testing. As for the second issue, we manually identify the main areas where SL MT fails to produce adequate translations in lack of context. We then create an evaluation set where these phenomena are annotated to alleviate automatic evaluation of DL systems. We train MT models using our corpus to demonstrate how using context leads to improvements."
}
Markdown (Informal)
[Document-aligned Japanese-English Conversation Parallel Corpus](https://preview.aclanthology.org/moar-dois/2020.wmt-1.74/) (Rikters et al., WMT 2020)
ACL