@comment{Workshop paper introducing the Mr. TyDi benchmark (1st Workshop on
Multilingual Representation Learning, 2021). The url field holds the permanent
ACL Anthology address for the paper id that also appears in the DOI.}
@inproceedings{zhang-etal-2021-mr,
  author    = {Zhang, Xinyu and
               Ma, Xueguang and
               Shi, Peng and
               Lin, Jimmy},
  title     = {Mr. {TyDi}: A Multi-lingual Benchmark for Dense Retrieval},
  editor    = {Ataman, Duygu and
               Birch, Alexandra and
               Conneau, Alexis and
               Firat, Orhan and
               Ruder, Sebastian and
               Sahin, Gozde Gul},
  booktitle = {Proceedings of the 1st Workshop on Multilingual Representation Learning},
  month     = nov,
  year      = {2021},
  address   = {Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  pages     = {127--137},
  doi       = {10.18653/v1/2021.mrl-1.12},
  url       = {https://aclanthology.org/2021.mrl-1.12/},
  abstract  = {We present Mr. TyDi, a multi-lingual benchmark dataset for mono-lingual retrieval in eleven typologically diverse languages, designed to evaluate ranking with learned dense representations. The goal of this resource is to spur research in dense retrieval techniques in non-English languages, motivated by recent observations that existing techniques for representation learning perform poorly when applied to out-of-distribution data. As a starting point, we provide zero-shot baselines for this new dataset based on a multi-lingual adaptation of DPR that we call ``mDPR''. Experiments show that although the effectiveness of mDPR is much lower than BM25, dense representations nevertheless appear to provide valuable relevance signals, improving BM25 results in sparse{--}dense hybrids. In addition to analyses of our results, we also discuss future challenges and present a research agenda in multi-lingual dense retrieval. Mr. TyDi can be downloaded at \url{https://github.com/castorini/mr.tydi}.}
}
Markdown (Informal)
[Mr. TyDi: A Multi-lingual Benchmark for Dense Retrieval](https://aclanthology.org/2021.mrl-1.12/) (Zhang et al., MRL 2021)
ACL
- Xinyu Zhang, Xueguang Ma, Peng Shi, and Jimmy Lin. 2021. Mr. TyDi: A Multi-lingual Benchmark for Dense Retrieval. In Proceedings of the 1st Workshop on Multilingual Representation Learning, pages 127–137, Punta Cana, Dominican Republic. Association for Computational Linguistics.