@inproceedings{guo-etal-2019-hierarchical,
title = "Hierarchical Document Encoder for Parallel Corpus Mining",
author = "Guo, Mandy and
Yang, Yinfei and
Stevens, Keith and
Cer, Daniel and
Ge, Heming and
Sung, Yun-hsuan and
Strope, Brian and
Kurzweil, Ray",
booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers)",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-5207",
doi = "10.18653/v1/W19-5207",
pages = "64--72",
abstract = "We explore using multilingual document embeddings for nearest neighbor mining of parallel data. Three document-level representations are investigated: (i) document embeddings generated by simply averaging multilingual sentence embeddings; (ii) a neural bag-of-words (BoW) document encoding model; (iii) a hierarchical multilingual document encoder (HiDE) that builds on our sentence-level model. The results show document embeddings derived from sentence-level averaging are surprisingly effective for clean datasets, but suggest models trained hierarchically at the document-level are more effective on noisy data. Analysis experiments demonstrate our hierarchical models are very robust to variations in the underlying sentence embedding quality. Using document embeddings trained with HiDE achieves the state-of-the-art on United Nations (UN) parallel document mining, 94.9{\%} P@1 for en-fr and 97.3{\%} P@1 for en-es.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="guo-etal-2019-hierarchical">
<titleInfo>
<title>Hierarchical Document Encoder for Parallel Corpus Mining</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mandy</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yinfei</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keith</namePart>
<namePart type="family">Stevens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Cer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heming</namePart>
<namePart type="family">Ge</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-hsuan</namePart>
<namePart type="family">Sung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Strope</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ray</namePart>
<namePart type="family">Kurzweil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-aug</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers)</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We explore using multilingual document embeddings for nearest neighbor mining of parallel data. Three document-level representations are investigated: (i) document embeddings generated by simply averaging multilingual sentence embeddings; (ii) a neural bag-of-words (BoW) document encoding model; (iii) a hierarchical multilingual document encoder (HiDE) that builds on our sentence-level model. The results show document embeddings derived from sentence-level averaging are surprisingly effective for clean datasets, but suggest models trained hierarchically at the document-level are more effective on noisy data. Analysis experiments demonstrate our hierarchical models are very robust to variations in the underlying sentence embedding quality. Using document embeddings trained with HiDE achieves the state-of-the-art on United Nations (UN) parallel document mining, 94.9% P@1 for en-fr and 97.3% P@1 for en-es.</abstract>
<identifier type="citekey">guo-etal-2019-hierarchical</identifier>
<identifier type="doi">10.18653/v1/W19-5207</identifier>
<location>
<url>https://aclanthology.org/W19-5207</url>
</location>
<part>
<date>2019-aug</date>
<extent unit="page">
<start>64</start>
<end>72</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Hierarchical Document Encoder for Parallel Corpus Mining
%A Guo, Mandy
%A Yang, Yinfei
%A Stevens, Keith
%A Cer, Daniel
%A Ge, Heming
%A Sung, Yun-hsuan
%A Strope, Brian
%A Kurzweil, Ray
%S Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers)
%D 2019
%8 aug
%I Association for Computational Linguistics
%C Florence, Italy
%F guo-etal-2019-hierarchical
%X We explore using multilingual document embeddings for nearest neighbor mining of parallel data. Three document-level representations are investigated: (i) document embeddings generated by simply averaging multilingual sentence embeddings; (ii) a neural bag-of-words (BoW) document encoding model; (iii) a hierarchical multilingual document encoder (HiDE) that builds on our sentence-level model. The results show document embeddings derived from sentence-level averaging are surprisingly effective for clean datasets, but suggest models trained hierarchically at the document-level are more effective on noisy data. Analysis experiments demonstrate our hierarchical models are very robust to variations in the underlying sentence embedding quality. Using document embeddings trained with HiDE achieves the state-of-the-art on United Nations (UN) parallel document mining, 94.9% P@1 for en-fr and 97.3% P@1 for en-es.
%R 10.18653/v1/W19-5207
%U https://aclanthology.org/W19-5207
%U https://doi.org/10.18653/v1/W19-5207
%P 64-72
Markdown (Informal)
[Hierarchical Document Encoder for Parallel Corpus Mining](https://aclanthology.org/W19-5207) (Guo et al., 2019)
ACL
- Mandy Guo, Yinfei Yang, Keith Stevens, Daniel Cer, Heming Ge, Yun-hsuan Sung, Brian Strope, and Ray Kurzweil. 2019. Hierarchical Document Encoder for Parallel Corpus Mining. In Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers), pages 64–72, Florence, Italy. Association for Computational Linguistics.