@inproceedings{eriksson-2016-quality,
title = "Quality Assessment of the {R}euters Vol. 2 Multilingual Corpus",
author = "Eriksson, Robin",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1286",
pages = "1813--1819",
abstract = "We introduce a framework for quality assurance of corpora, and apply it to the Reuters Multilingual Corpus (RCV2). The results of this quality assessment of this standard newsprint corpus reveal a significant duplication problem and, to a lesser extent, a problem with corrupted articles. From the raw collection of some 487,000 articles, almost one tenth are trivial duplicates. A smaller fraction of articles appear to be corrupted and should be excluded for that reason. The detailed results are being made available as on-line appendices to this article. This effort also demonstrates the beginnings of a constraint-based methodological framework for quality assessment and quality assurance for corpora. As a first implementation of this framework, we have investigated constraints to verify sample integrity, and to diagnose sample duplication, entropy aberrations, and tagging inconsistencies. To help identify near-duplicates in the corpus, we have employed both entropy measurements and a simple byte bigram incidence digest.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="eriksson-2016-quality">
<titleInfo>
<title>Quality Assessment of the Reuters Vol. 2 Multilingual Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Robin</namePart>
<namePart type="family">Eriksson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce a framework for quality assurance of corpora, and apply it to the Reuters Multilingual Corpus (RCV2). The results of this quality assessment of this standard newsprint corpus reveal a significant duplication problem and, to a lesser extent, a problem with corrupted articles. From the raw collection of some 487,000 articles, almost one tenth are trivial duplicates. A smaller fraction of articles appear to be corrupted and should be excluded for that reason. The detailed results are being made available as on-line appendices to this article. This effort also demonstrates the beginnings of a constraint-based methodological framework for quality assessment and quality assurance for corpora. As a first implementation of this framework, we have investigated constraints to verify sample integrity, and to diagnose sample duplication, entropy aberrations, and tagging inconsistencies. To help identify near-duplicates in the corpus, we have employed both entropy measurements and a simple byte bigram incidence digest.</abstract>
<identifier type="citekey">eriksson-2016-quality</identifier>
<location>
<url>https://aclanthology.org/L16-1286</url>
</location>
<part>
<date>2016-may</date>
<extent unit="page">
<start>1813</start>
<end>1819</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Quality Assessment of the Reuters Vol. 2 Multilingual Corpus
%A Eriksson, Robin
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 may
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F eriksson-2016-quality
%X We introduce a framework for quality assurance of corpora, and apply it to the Reuters Multilingual Corpus (RCV2). The results of this quality assessment of this standard newsprint corpus reveal a significant duplication problem and, to a lesser extent, a problem with corrupted articles. From the raw collection of some 487,000 articles, almost one tenth are trivial duplicates. A smaller fraction of articles appear to be corrupted and should be excluded for that reason. The detailed results are being made available as on-line appendices to this article. This effort also demonstrates the beginnings of a constraint-based methodological framework for quality assessment and quality assurance for corpora. As a first implementation of this framework, we have investigated constraints to verify sample integrity, and to diagnose sample duplication, entropy aberrations, and tagging inconsistencies. To help identify near-duplicates in the corpus, we have employed both entropy measurements and a simple byte bigram incidence digest.
%U https://aclanthology.org/L16-1286
%P 1813-1819
Markdown (Informal)
[Quality Assessment of the Reuters Vol. 2 Multilingual Corpus](https://aclanthology.org/L16-1286) (Eriksson, LREC 2016)
ACL