@inproceedings{nguyen-etal-2020-vietnamese,
title = "A {V}ietnamese Dataset for Evaluating Machine Reading Comprehension",
author = "Nguyen, Kiet and
Nguyen, Vu and
Nguyen, Anh and
Nguyen, Ngan",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://preview.aclanthology.org/Add-Cong-Liu-Florida-Atlantic-University-author-id/2020.coling-main.233/",
doi = "10.18653/v1/2020.coling-main.233",
pages = "2595--2605",
abstract = "Over 97 million inhabitants speak Vietnamese as the native language in the world. However, there are few research studies on machine reading comprehension (MRC) in Vietnamese, the task of understanding a document or text, and answering questions related to it. Due to the lack of benchmark datasets for Vietnamese, we present the Vietnamese Question Answering Dataset (UIT-ViQuAD), a new dataset for the low-resource language as Vietnamese to evaluate MRC models. This dataset comprises over 23,000 human-generated question-answer pairs based on 5,109 passages of 174 Vietnamese articles from Wikipedia. In particular, we propose a new process of dataset creation for Vietnamese MRC. Our in-depth analyses illustrate that our dataset requires abilities beyond simple reasoning like word matching and demands complicate reasoning such as single-sentence and multiple-sentence inferences. Besides, we conduct experiments on state-of-the-art MRC methods in English and Chinese as the first experimental models on UIT-ViQuAD, which will be compared to further models. We also estimate human performances on the dataset and compare it to the experimental results of several powerful machine models. As a result, the substantial differences between humans and the best model performances on the dataset indicate that improvements can be explored on UIT-ViQuAD through future research. Our dataset is freely available to encourage the research community to overcome challenges in Vietnamese MRC."
}
Markdown (Informal)
[A Vietnamese Dataset for Evaluating Machine Reading Comprehension](https://preview.aclanthology.org/Add-Cong-Liu-Florida-Atlantic-University-author-id/2020.coling-main.233/) (Nguyen et al., COLING 2020)
ACL