% Workshop paper (SIGTYP 2024). The url field uses the permanent ACL Anthology
% address for paper ID 2024.sigtyp-1.8 instead of a temporary preview branch.
@inproceedings{reijnaers-pouw-2024-gtnc,
  title     = {{GTNC}: A Many-To-One Dataset of {Google} Translations from {NewsCrawl}},
  author    = {Reijnaers, Damiaan and
               Pouw, Charlotte},
  editor    = {Hahn, Michael and
               Sorokin, Alexey and
               Kumar, Ritesh and
               Shcherbakov, Andreas and
               Otmakhova, Yulia and
               Yang, Jinrui and
               Serikov, Oleg and
               Rani, Priya and
               Ponti, Edoardo M. and
               Murado{\u{g}}lu, Saliha and
               Gao, Rena and
               Cotterell, Ryan and
               Vylomova, Ekaterina},
  booktitle = {Proceedings of the 6th Workshop on Research in Computational Linguistic Typology and Multilingual NLP},
  month     = mar,
  year      = {2024},
  address   = {St. Julian's, Malta},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.sigtyp-1.8/},
  pages     = {58--65},
  abstract  = {This paper lays the groundwork for initiating research into Source Language Identification; the task of identifying the original language of a machine-translated text. We contribute a dataset of translations from a typologically diverse spectrum of languages into English and use it to set initial baselines for this novel task.},
}
Markdown (Informal)
[GTNC: A Many-To-One Dataset of Google Translations from NewsCrawl](https://aclanthology.org/2024.sigtyp-1.8/) (Reijnaers & Pouw, SIGTYP 2024)
ACL