% Conference paper presenting the SwissCrawl Swiss German text corpus
% (Proceedings of the 12th Language Resources and Evaluation Conference, 2020).
% Proper nouns in the title are braced as whole words so that sentence-casing
% styles keep their capitals without splitting a word across brace groups.
@inproceedings{linder-etal-2020-automatic,
  title     = {Automatic Creation of Text Corpora for Low-Resource Languages from the {Internet}: The Case of {Swiss} {German}},
  author    = {Linder, Lucy and
               Jungo, Michael and
               Hennebert, Jean and
               Musat, Claudiu Cristian and
               Fischer, Andreas},
  booktitle = {Proceedings of the 12th Language Resources and Evaluation Conference},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.lrec-1.329},
  pages     = {2706--2711},
  abstract  = {This paper presents SwissCrawl, the largest Swiss German text corpus to date. Composed of more than half a million sentences, it was generated using a customized web scraping tool that could be applied to other low-resource languages as well. The approach demonstrates how freely available web pages can be used to construct comprehensive text corpora, which are of fundamental importance for natural language processing. In an experimental evaluation, we show that using the new corpus leads to significant improvements for the task of language modeling.},
  language  = {English},
  isbn      = {979-10-95546-34-4},
}
<?xml version="1.0" encoding="UTF-8"?>
<!--
  MODS record for the conference paper with citekey linder-etal-2020-automatic.
  The top-level elements describe the paper itself; the relatedItem of type
  "host" describes the proceedings volume that contains it.
  Dates are given as W3CDTF year-month values (2020-05) rather than a month name.
-->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="linder-etal-2020-automatic">
    <titleInfo>
      <title>Automatic Creation of Text Corpora for Low-Resource Languages from the Internet: The Case of Swiss German</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Lucy</namePart>
      <namePart type="family">Linder</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michael</namePart>
      <namePart type="family">Jungo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jean</namePart>
      <namePart type="family">Hennebert</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Claudiu</namePart>
      <namePart type="given">Cristian</namePart>
      <namePart type="family">Musat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Andreas</namePart>
      <namePart type="family">Fischer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued encoding="w3cdtf">2020-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <language>
      <languageTerm type="text">English</languageTerm>
      <languageTerm type="code" authority="iso639-2b">eng</languageTerm>
    </language>
    <!-- Host item: the proceedings volume -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
      </titleInfo>
      <originInfo>
        <publisher>European Language Resources Association</publisher>
        <place>
          <placeTerm type="text">Marseille, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-10-95546-34-4</identifier>
    </relatedItem>
    <abstract>This paper presents SwissCrawl, the largest Swiss German text corpus to date. Composed of more than half a million sentences, it was generated using a customized web scraping tool that could be applied to other low-resource languages as well. The approach demonstrates how freely available web pages can be used to construct comprehensive text corpora, which are of fundamental importance for natural language processing. In an experimental evaluation, we show that using the new corpus leads to significant improvements for the task of language modeling.</abstract>
    <identifier type="citekey">linder-etal-2020-automatic</identifier>
    <location>
      <url>https://aclanthology.org/2020.lrec-1.329</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date encoding="w3cdtf">2020-05</date>
      <extent unit="page">
        <start>2706</start>
        <end>2711</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Creation of Text Corpora for Low-Resource Languages from the Internet: The Case of Swiss German
%A Linder, Lucy
%A Jungo, Michael
%A Hennebert, Jean
%A Musat, Claudiu Cristian
%A Fischer, Andreas
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F linder-etal-2020-automatic
%X This paper presents SwissCrawl, the largest Swiss German text corpus to date. Composed of more than half a million sentences, it was generated using a customized web scraping tool that could be applied to other low-resource languages as well. The approach demonstrates how freely available web pages can be used to construct comprehensive text corpora, which are of fundamental importance for natural language processing. In an experimental evaluation, we show that using the new corpus leads to significant improvements for the task of language modeling.
%U https://aclanthology.org/2020.lrec-1.329
%P 2706-2711
Markdown (Informal)
[Automatic Creation of Text Corpora for Low-Resource Languages from the Internet: The Case of Swiss German](https://aclanthology.org/2020.lrec-1.329) (Linder et al., LREC 2020)
ACL