@inproceedings{taille-etal-2020-lets,
title = "Let{'}s {S}top {I}ncorrect {C}omparisons in {E}nd-to-end {R}elation {E}xtraction!",
author = "Taill{\'e}, Bruno and
Guigue, Vincent and
Scoutheeten, Geoffrey and
Gallinari, Patrick",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.emnlp-main.301",
doi = "10.18653/v1/2020.emnlp-main.301",
pages = "3689--3701",
abstract = "Despite efforts to distinguish three different evaluation setups (Bekoulis et al., 2018), numerous end-to-end Relation Extraction (RE) articles present unreliable performance comparison to previous work. In this paper, we first identify several patterns of invalid comparisons in published papers and describe them to avoid their propagation. We then propose a small empirical study to quantify the most common mistake{'}s impact and evaluate it leads to overestimating the final RE performance by around 5{\%} on ACE05. We also seize this opportunity to study the unexplored ablations of two recent developments: the use of language model pretraining (specifically BERT) and span-level NER. This meta-analysis emphasizes the need for rigor in the report of both the evaluation setting and the dataset statistics. We finally call for unifying the evaluation setting in end-to-end RE.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="taille-etal-2020-lets">
<titleInfo>
<title>Let’s Stop Incorrect Comparisons in End-to-end Relation Extraction!</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bruno</namePart>
<namePart type="family">Taillé</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vincent</namePart>
<namePart type="family">Guigue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Geoffrey</namePart>
<namePart type="family">Scoutheeten</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Gallinari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite efforts to distinguish three different evaluation setups (Bekoulis et al., 2018), numerous end-to-end Relation Extraction (RE) articles present unreliable performance comparison to previous work. In this paper, we first identify several patterns of invalid comparisons in published papers and describe them to avoid their propagation. We then propose a small empirical study to quantify the most common mistake’s impact and evaluate it leads to overestimating the final RE performance by around 5% on ACE05. We also seize this opportunity to study the unexplored ablations of two recent developments: the use of language model pretraining (specifically BERT) and span-level NER. This meta-analysis emphasizes the need for rigor in the report of both the evaluation setting and the dataset statistics. We finally call for unifying the evaluation setting in end-to-end RE.</abstract>
<identifier type="citekey">taille-etal-2020-lets</identifier>
<identifier type="doi">10.18653/v1/2020.emnlp-main.301</identifier>
<location>
<url>https://aclanthology.org/2020.emnlp-main.301</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>3689</start>
<end>3701</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Let’s Stop Incorrect Comparisons in End-to-end Relation Extraction!
%A Taillé, Bruno
%A Guigue, Vincent
%A Scoutheeten, Geoffrey
%A Gallinari, Patrick
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 nov
%I Association for Computational Linguistics
%C Online
%F taille-etal-2020-lets
%X Despite efforts to distinguish three different evaluation setups (Bekoulis et al., 2018), numerous end-to-end Relation Extraction (RE) articles present unreliable performance comparison to previous work. In this paper, we first identify several patterns of invalid comparisons in published papers and describe them to avoid their propagation. We then propose a small empirical study to quantify the most common mistake’s impact and evaluate it leads to overestimating the final RE performance by around 5% on ACE05. We also seize this opportunity to study the unexplored ablations of two recent developments: the use of language model pretraining (specifically BERT) and span-level NER. This meta-analysis emphasizes the need for rigor in the report of both the evaluation setting and the dataset statistics. We finally call for unifying the evaluation setting in end-to-end RE.
%R 10.18653/v1/2020.emnlp-main.301
%U https://aclanthology.org/2020.emnlp-main.301
%U https://doi.org/10.18653/v1/2020.emnlp-main.301
%P 3689-3701
Markdown (Informal)
[Let’s Stop Incorrect Comparisons in End-to-end Relation Extraction!](https://aclanthology.org/2020.emnlp-main.301) (Taillé et al., EMNLP 2020)
ACL