@inproceedings{thompson-koehn-2020-exploiting,
title = "Exploiting Sentence Order in Document Alignment",
author = "Thompson, Brian and
Koehn, Philipp",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.emnlp-main.483",
doi = "10.18653/v1/2020.emnlp-main.483",
pages = "5997--6007",
abstract = "We present a simple document alignment method that incorporates sentence order information in both candidate generation and candidate re-scoring. Our method results in 61{\%} relative reduction in error compared to the best previously published result on the WMT16 document alignment shared task. Our method improves downstream MT performance on web-scraped Sinhala{--}English documents from ParaCrawl, outperforming the document alignment method used in the most recent ParaCrawl release. It also outperforms a comparable corpora method which uses the same multilingual embeddings, demonstrating that exploiting sentence order is beneficial even if the end goal is sentence-level bitext.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="thompson-koehn-2020-exploiting">
<titleInfo>
<title>Exploiting Sentence Order in Document Alignment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Thompson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-nov</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a simple document alignment method that incorporates sentence order information in both candidate generation and candidate re-scoring. Our method results in 61% relative reduction in error compared to the best previously published result on the WMT16 document alignment shared task. Our method improves downstream MT performance on web-scraped Sinhala–English documents from ParaCrawl, outperforming the document alignment method used in the most recent ParaCrawl release. It also outperforms a comparable corpora method which uses the same multilingual embeddings, demonstrating that exploiting sentence order is beneficial even if the end goal is sentence-level bitext.</abstract>
<identifier type="citekey">thompson-koehn-2020-exploiting</identifier>
<identifier type="doi">10.18653/v1/2020.emnlp-main.483</identifier>
<location>
<url>https://aclanthology.org/2020.emnlp-main.483</url>
</location>
<part>
<date>2020-nov</date>
<extent unit="page">
<start>5997</start>
<end>6007</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploiting Sentence Order in Document Alignment
%A Thompson, Brian
%A Koehn, Philipp
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 nov
%I Association for Computational Linguistics
%C Online
%F thompson-koehn-2020-exploiting
%X We present a simple document alignment method that incorporates sentence order information in both candidate generation and candidate re-scoring. Our method results in 61% relative reduction in error compared to the best previously published result on the WMT16 document alignment shared task. Our method improves downstream MT performance on web-scraped Sinhala–English documents from ParaCrawl, outperforming the document alignment method used in the most recent ParaCrawl release. It also outperforms a comparable corpora method which uses the same multilingual embeddings, demonstrating that exploiting sentence order is beneficial even if the end goal is sentence-level bitext.
%R 10.18653/v1/2020.emnlp-main.483
%U https://aclanthology.org/2020.emnlp-main.483
%U https://doi.org/10.18653/v1/2020.emnlp-main.483
%P 5997-6007
Markdown (Informal)
[Exploiting Sentence Order in Document Alignment](https://aclanthology.org/2020.emnlp-main.483) (Thompson & Koehn, EMNLP 2020)
ACL
- Brian Thompson and Philipp Koehn. 2020. Exploiting Sentence Order in Document Alignment. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 5997–6007, Online. Association for Computational Linguistics.