@inproceedings{yue-etal-2020-clinical,
title = "Clinical Reading Comprehension: A Thorough Analysis of the emr{QA} Dataset",
author = "Yue, Xiang and
Jimenez Gutierrez, Bernal and
Sun, Huan",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.410",
doi = "10.18653/v1/2020.acl-main.410",
pages = "4474--4486",
abstract = "Machine reading comprehension has made great progress in recent years owing to large-scale annotated datasets. In the clinical domain, however, creating such datasets is quite difficult due to the domain expertise required for annotation. Recently, Pampari et al. (EMNLP{'}18) tackled this issue by using expert-annotated question templates and existing i2b2 annotations to create emrQA, the first large-scale dataset for question answering (QA) based on clinical notes. In this paper, we provide an in-depth analysis of this dataset and the clinical reading comprehension (CliniRC) task. From our qualitative analysis, we find that (i) emrQA answers are often incomplete, and (ii) emrQA questions are often answerable without using domain knowledge. From our quantitative experiments, surprising results include that (iii) using a small sampled subset (5{\%}-20{\%}), we can obtain roughly equal performance compared to the model trained on the entire dataset, (iv) this performance is close to human expert{'}s performance, and (v) BERT models do not beat the best performing base model. Following our analysis of the emrQA, we further explore two desired aspects of CliniRC systems: the ability to utilize clinical domain knowledge and to generalize to unseen questions and contexts. We argue that both should be considered when creating future datasets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yue-etal-2020-clinical">
<titleInfo>
<title>Clinical Reading Comprehension: A Thorough Analysis of the emrQA Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiang</namePart>
<namePart type="family">Yue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bernal</namePart>
<namePart type="family">Jimenez Gutierrez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huan</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-jul</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Machine reading comprehension has made great progress in recent years owing to large-scale annotated datasets. In the clinical domain, however, creating such datasets is quite difficult due to the domain expertise required for annotation. Recently, Pampari et al. (EMNLP’18) tackled this issue by using expert-annotated question templates and existing i2b2 annotations to create emrQA, the first large-scale dataset for question answering (QA) based on clinical notes. In this paper, we provide an in-depth analysis of this dataset and the clinical reading comprehension (CliniRC) task. From our qualitative analysis, we find that (i) emrQA answers are often incomplete, and (ii) emrQA questions are often answerable without using domain knowledge. From our quantitative experiments, surprising results include that (iii) using a small sampled subset (5%-20%), we can obtain roughly equal performance compared to the model trained on the entire dataset, (iv) this performance is close to human expert’s performance, and (v) BERT models do not beat the best performing base model. Following our analysis of the emrQA, we further explore two desired aspects of CliniRC systems: the ability to utilize clinical domain knowledge and to generalize to unseen questions and contexts. We argue that both should be considered when creating future datasets.</abstract>
<identifier type="citekey">yue-etal-2020-clinical</identifier>
<identifier type="doi">10.18653/v1/2020.acl-main.410</identifier>
<location>
<url>https://aclanthology.org/2020.acl-main.410</url>
</location>
<part>
<date>2020-jul</date>
<extent unit="page">
<start>4474</start>
<end>4486</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Clinical Reading Comprehension: A Thorough Analysis of the emrQA Dataset
%A Yue, Xiang
%A Jimenez Gutierrez, Bernal
%A Sun, Huan
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics
%D 2020
%8 jul
%I Association for Computational Linguistics
%C Online
%F yue-etal-2020-clinical
%X Machine reading comprehension has made great progress in recent years owing to large-scale annotated datasets. In the clinical domain, however, creating such datasets is quite difficult due to the domain expertise required for annotation. Recently, Pampari et al. (EMNLP’18) tackled this issue by using expert-annotated question templates and existing i2b2 annotations to create emrQA, the first large-scale dataset for question answering (QA) based on clinical notes. In this paper, we provide an in-depth analysis of this dataset and the clinical reading comprehension (CliniRC) task. From our qualitative analysis, we find that (i) emrQA answers are often incomplete, and (ii) emrQA questions are often answerable without using domain knowledge. From our quantitative experiments, surprising results include that (iii) using a small sampled subset (5%-20%), we can obtain roughly equal performance compared to the model trained on the entire dataset, (iv) this performance is close to human expert’s performance, and (v) BERT models do not beat the best performing base model. Following our analysis of the emrQA, we further explore two desired aspects of CliniRC systems: the ability to utilize clinical domain knowledge and to generalize to unseen questions and contexts. We argue that both should be considered when creating future datasets.
%R 10.18653/v1/2020.acl-main.410
%U https://aclanthology.org/2020.acl-main.410
%U https://doi.org/10.18653/v1/2020.acl-main.410
%P 4474-4486
Markdown (Informal)
[Clinical Reading Comprehension: A Thorough Analysis of the emrQA Dataset](https://aclanthology.org/2020.acl-main.410) (Yue et al., ACL 2020)