@inproceedings{kliche-etal-2014-eidentity,
title = "The e{I}dentity Text Exploration Workbench",
author = "Kliche, Fritz and
Blessing, Andr{\'e} and
Heid, Ulrich and
Sonntag, Jonathan",
booktitle = "Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)",
month = may,
year = "2014",
address = "Reykjavik, Iceland",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2014/pdf/332_Paper.pdf",
pages = "691--697",
abstract = "We work on tools to explore text contents and metadata of newspaper articles as provided by news archives. Our tool components are being integrated into an {``}Exploration Workbench{''} for Digital Humanities researchers. Next to the conversion of different data formats and character encodings, a prominent feature of our design is its {``}Wizard{''} function for corpus building: Researchers import raw data and define patterns to extract text contents and metadata. The Workbench also comprises different tools for data cleaning. These include filtering of off-topic articles, duplicates and near-duplicates, corrupted and empty articles. We currently work on ca. 860.000 newspaper articles from different media archives, provided in different data formats. We index the data with state-of-the-art systems to allow for large scale information retrieval. We extract metadata on publishing dates, author names, newspaper sections, etc., and split articles into segments such as headlines, subtitles, paragraphs, etc. After cleaning the data and compiling a thematically homogeneous corpus, the sample can be used for quantitative analyses which are not affected by noise. Users can retrieve sets of articles on different topics, issues or otherwise defined research questions ({``}subcorpora{''}) and investigate quantitatively their media attention on the timeline ({``}Issue Cycles{''}).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kliche-etal-2014-eidentity">
<titleInfo>
<title>The eIdentity Text Exploration Workbench</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fritz</namePart>
<namePart type="family">Kliche</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">André</namePart>
<namePart type="family">Blessing</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ulrich</namePart>
<namePart type="family">Heid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Sonntag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Reykjavik, Iceland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We work on tools to explore text contents and metadata of newspaper articles as provided by news archives. Our tool components are being integrated into an “Exploration Workbench” for Digital Humanities researchers. Next to the conversion of different data formats and character encodings, a prominent feature of our design is its “Wizard” function for corpus building: Researchers import raw data and define patterns to extract text contents and metadata. The Workbench also comprises different tools for data cleaning. These include filtering of off-topic articles, duplicates and near-duplicates, corrupted and empty articles. We currently work on ca. 860.000 newspaper articles from different media archives, provided in different data formats. We index the data with state-of-the-art systems to allow for large scale information retrieval. We extract metadata on publishing dates, author names, newspaper sections, etc., and split articles into segments such as headlines, subtitles, paragraphs, etc. After cleaning the data and compiling a thematically homogeneous corpus, the sample can be used for quantitative analyses which are not affected by noise. Users can retrieve sets of articles on different topics, issues or otherwise defined research questions (“subcorpora”) and investigate quantitatively their media attention on the timeline (“Issue Cycles”).</abstract>
<identifier type="citekey">kliche-etal-2014-eidentity</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2014/pdf/332_Paper.pdf</url>
</location>
<part>
<date>2014-05</date>
<extent unit="page">
<start>691</start>
<end>697</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The eIdentity Text Exploration Workbench
%A Kliche, Fritz
%A Blessing, André
%A Heid, Ulrich
%A Sonntag, Jonathan
%S Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)
%D 2014
%8 May
%I European Language Resources Association (ELRA)
%C Reykjavik, Iceland
%F kliche-etal-2014-eidentity
%X We work on tools to explore text contents and metadata of newspaper articles as provided by news archives. Our tool components are being integrated into an “Exploration Workbench” for Digital Humanities researchers. Next to the conversion of different data formats and character encodings, a prominent feature of our design is its “Wizard” function for corpus building: Researchers import raw data and define patterns to extract text contents and metadata. The Workbench also comprises different tools for data cleaning. These include filtering of off-topic articles, duplicates and near-duplicates, corrupted and empty articles. We currently work on ca. 860.000 newspaper articles from different media archives, provided in different data formats. We index the data with state-of-the-art systems to allow for large scale information retrieval. We extract metadata on publishing dates, author names, newspaper sections, etc., and split articles into segments such as headlines, subtitles, paragraphs, etc. After cleaning the data and compiling a thematically homogeneous corpus, the sample can be used for quantitative analyses which are not affected by noise. Users can retrieve sets of articles on different topics, issues or otherwise defined research questions (“subcorpora”) and investigate quantitatively their media attention on the timeline (“Issue Cycles”).
%U http://www.lrec-conf.org/proceedings/lrec2014/pdf/332_Paper.pdf
%P 691-697
Markdown (Informal)
[The eIdentity Text Exploration Workbench](http://www.lrec-conf.org/proceedings/lrec2014/pdf/332_Paper.pdf) (Kliche et al., LREC 2014)
ACL
- Fritz Kliche, André Blessing, Ulrich Heid, and Jonathan Sonntag. 2014. The eIdentity Text Exploration Workbench. In Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14), pages 691–697, Reykjavik, Iceland. European Language Resources Association (ELRA).