@inproceedings{carlsson-etal-2021-gandalf,
title = "{GANDALF}: a General Character Name Description Dataset for Long Fiction",
author = "Carlsson, Fredrik and
Sahlgren, Magnus and
Olsson, Fredrik and
Cuba Gyllensten, Amaru",
booktitle = "Proceedings of the 3rd Workshop on Machine Reading for Question Answering",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.mrqa-1.13",
doi = "10.18653/v1/2021.mrqa-1.13",
pages = "119--132",
abstract = "This paper introduces a long-range multiple-choice Question Answering (QA) dataset, based on full-length fiction book texts. The questions are formulated as 10-way multiple-choice questions, where the task is to select the correct character name given a character description, or vice-versa. Each character description is formulated in natural text and often contains information from several sections throughout the book. We provide 20,000 questions created from 10,000 manually annotated descriptions of characters from 177 books containing 152,917 words on average. We address the current discourse regarding dataset bias and leakage by a simple anonymization procedure, which in turn enables interesting probing possibilities. Finally, we show that suitable baseline algorithms perform very poorly on this task, with the book size itself making it non-trivial to attempt a Transformer-based QA solution. This leaves ample room for future improvement, and hints at the need for a completely different type of solution.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="carlsson-etal-2021-gandalf">
<titleInfo>
<title>GANDALF: a General Character Name Description Dataset for Long Fiction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fredrik</namePart>
<namePart type="family">Carlsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Magnus</namePart>
<namePart type="family">Sahlgren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fredrik</namePart>
<namePart type="family">Olsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amaru</namePart>
<namePart type="family">Cuba Gyllensten</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued encoding="w3cdtf">2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Workshop on Machine Reading for Question Answering</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces a long-range multiple-choice Question Answering (QA) dataset, based on full-length fiction book texts. The questions are formulated as 10-way multiple-choice questions, where the task is to select the correct character name given a character description, or vice-versa. Each character description is formulated in natural text and often contains information from several sections throughout the book. We provide 20,000 questions created from 10,000 manually annotated descriptions of characters from 177 books containing 152,917 words on average. We address the current discourse regarding dataset bias and leakage by a simple anonymization procedure, which in turn enables interesting probing possibilities. Finally, we show that suitable baseline algorithms perform very poorly on this task, with the book size itself making it non-trivial to attempt a Transformer-based QA solution. This leaves ample room for future improvement, and hints at the need for a completely different type of solution.</abstract>
<identifier type="citekey">carlsson-etal-2021-gandalf</identifier>
<identifier type="doi">10.18653/v1/2021.mrqa-1.13</identifier>
<location>
<url>https://aclanthology.org/2021.mrqa-1.13</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>119</start>
<end>132</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GANDALF: a General Character Name Description Dataset for Long Fiction
%A Carlsson, Fredrik
%A Sahlgren, Magnus
%A Olsson, Fredrik
%A Cuba Gyllensten, Amaru
%S Proceedings of the 3rd Workshop on Machine Reading for Question Answering
%D 2021
%8 nov
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F carlsson-etal-2021-gandalf
%X This paper introduces a long-range multiple-choice Question Answering (QA) dataset, based on full-length fiction book texts. The questions are formulated as 10-way multiple-choice questions, where the task is to select the correct character name given a character description, or vice-versa. Each character description is formulated in natural text and often contains information from several sections throughout the book. We provide 20,000 questions created from 10,000 manually annotated descriptions of characters from 177 books containing 152,917 words on average. We address the current discourse regarding dataset bias and leakage by a simple anonymization procedure, which in turn enables interesting probing possibilities. Finally, we show that suitable baseline algorithms perform very poorly on this task, with the book size itself making it non-trivial to attempt a Transformer-based QA solution. This leaves ample room for future improvement, and hints at the need for a completely different type of solution.
%R 10.18653/v1/2021.mrqa-1.13
%U https://aclanthology.org/2021.mrqa-1.13
%U https://doi.org/10.18653/v1/2021.mrqa-1.13
%P 119-132
Markdown (Informal)
[GANDALF: a General Character Name Description Dataset for Long Fiction](https://aclanthology.org/2021.mrqa-1.13) (Carlsson et al., MRQA 2021)
ACL