% Workshop paper by Peter Devine in the MRL 2024 proceedings (ACL Anthology ID 2024.mrl-1.6).
% Proper nouns and acronyms are brace-protected so sentence-casing styles keep their capitals.
@inproceedings{devine-2024-tagengo,
    title     = {{Tagengo}: A Multilingual Chat Dataset},
    author    = {Devine, Peter},
    editor    = {S{\"a}lev{\"a}, Jonne and Owodunni, Abraham},
    booktitle = {Proceedings of the Fourth Workshop on Multilingual Representation Learning ({MRL} 2024)},
    month     = nov,
    year      = {2024},
    address   = {Miami, Florida, USA},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2024.mrl-1.6/},
    doi       = {10.18653/v1/2024.mrl-1.6},
    pages     = {106--113},
    abstract  = {Open source large language models (LLMs) have shown great improvements in recent times. However, many of these models are focused solely on popular spoken languages. We present a high quality dataset of more than 70k prompt-response pairs in 74 languages which consist of human generated prompts and synthetic responses. We use this dataset to train a state-of-the-art open source English LLM to chat multilingually. We evaluate our model on MT-Bench chat benchmarks in 6 languages, finding that our multilingual model outperforms previous state-of-the-art open source LLMs across each language. We further find that training on more multilingual data is beneficial to the performance in a chosen target language (Japanese) compared to simply training on only data in that language. These results indicate the necessity of training on large amounts of high quality multilingual data to make a more accessible LLM.},
}
Markdown (Informal)
[Tagengo: A Multilingual Chat Dataset](https://aclanthology.org/2024.mrl-1.6/) (Devine, MRL 2024)
ACL
- Peter Devine. 2024. Tagengo: A Multilingual Chat Dataset. In Proceedings of the Fourth Workshop on Multilingual Representation Learning (MRL 2024), pages 106–113, Miami, Florida, USA. Association for Computational Linguistics.