@inproceedings{volk-etal-2010-challenges,
title = "Challenges in Building a Multilingual Alpine Heritage Corpus",
author = "Volk, Martin and
Bubenhofer, Noah and
Althaus, Adrian and
Bangerter, Maya and
Furrer, Lenz and
Ruef, Beni",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2010/pdf/110_Paper.pdf",
abstract = "This paper describes our efforts to build a multilingual heritage corpus of alpine texts. Currently we digitize the yearbooks of the Swiss Alpine Club which contain articles in French, German, Italian and Romansch. Articles comprise mountaineering reports from all corners of the earth, but also scientific topics such as topography, geology or glacierology as well as occasional poetry and lyrics. We have already scanned close to 70,000 pages which has resulted in a corpus of 25 million words, 10{\%} of which is a parallel French-German corpus. We have solved a number of challenges in automatic language identification and text structure recognition. Our next goal is to identify the great variety of toponyms (e.g. names of mountains and valleys, glaciers and rivers, trails and cabins) in this corpus, and we sketch how a large gazetteer of Swiss topographical names can be exploited for this purpose. Despite the size of the resource, exact matching leads to a low recall because of spelling variations, language mixtures and partial repetitions.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="volk-etal-2010-challenges">
<titleInfo>
<title>Challenges in Building a Multilingual Alpine Heritage Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Martin</namePart>
<namePart type="family">Volk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noah</namePart>
<namePart type="family">Bubenhofer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrian</namePart>
<namePart type="family">Althaus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maya</namePart>
<namePart type="family">Bangerter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lenz</namePart>
<namePart type="family">Furrer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Beni</namePart>
<namePart type="family">Ruef</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes our efforts to build a multilingual heritage corpus of alpine texts. Currently we digitize the yearbooks of the Swiss Alpine Club which contain articles in French, German, Italian and Romansch. Articles comprise mountaineering reports from all corners of the earth, but also scientific topics such as topography, geology or glacierology as well as occasional poetry and lyrics. We have already scanned close to 70,000 pages which has resulted in a corpus of 25 million words, 10% of which is a parallel French-German corpus. We have solved a number of challenges in automatic language identification and text structure recognition. Our next goal is to identify the great variety of toponyms (e.g. names of mountains and valleys, glaciers and rivers, trails and cabins) in this corpus, and we sketch how a large gazetteer of Swiss topographical names can be exploited for this purpose. Despite the size of the resource, exact matching leads to a low recall because of spelling variations, language mixtures and partial repetitions.</abstract>
<identifier type="citekey">volk-etal-2010-challenges</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/110_Paper.pdf</url>
</location>
<part>
<date>2010-may</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Challenges in Building a Multilingual Alpine Heritage Corpus
%A Volk, Martin
%A Bubenhofer, Noah
%A Althaus, Adrian
%A Bangerter, Maya
%A Furrer, Lenz
%A Ruef, Beni
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 may
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F volk-etal-2010-challenges
%X This paper describes our efforts to build a multilingual heritage corpus of alpine texts. Currently we digitize the yearbooks of the Swiss Alpine Club which contain articles in French, German, Italian and Romansch. Articles comprise mountaineering reports from all corners of the earth, but also scientific topics such as topography, geology or glacierology as well as occasional poetry and lyrics. We have already scanned close to 70,000 pages which has resulted in a corpus of 25 million words, 10% of which is a parallel French-German corpus. We have solved a number of challenges in automatic language identification and text structure recognition. Our next goal is to identify the great variety of toponyms (e.g. names of mountains and valleys, glaciers and rivers, trails and cabins) in this corpus, and we sketch how a large gazetteer of Swiss topographical names can be exploited for this purpose. Despite the size of the resource, exact matching leads to a low recall because of spelling variations, language mixtures and partial repetitions.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/110_Paper.pdf
Markdown (Informal)
[Challenges in Building a Multilingual Alpine Heritage Corpus](http://www.lrec-conf.org/proceedings/lrec2010/pdf/110_Paper.pdf) (Volk et al., LREC 2010)
ACL
- Martin Volk, Noah Bubenhofer, Adrian Althaus, Maya Bangerter, Lenz Furrer, and Beni Ruef. 2010. Challenges in Building a Multilingual Alpine Heritage Corpus. In Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10), Valletta, Malta. European Language Resources Association (ELRA).