@inproceedings{erjavec-1999-elan,
title = "The {ELAN} {S}lovene-{E}nglish aligned corpus",
author = "Erjavec, Tomaz",
booktitle = "Proceedings of Machine Translation Summit VII",
month = sep # " 13-17",
year = "1999",
address = "Singapore, Singapore",
url = "https://aclanthology.org/1999.mtsummit-1.51",
pages = "349--357",
abstract = "Multilingual parallel corpora are a basic resource for research and development of MT. Such corpora are still scarce, especially for lower-diffusion languages. The paper presents a sentence-aligned tokenised Slovene-English corpus, developed in the scope of the EU ELAN project. The corpus contains 1 million words from fifteen recent terminology-rich texts and is encoded according to the Guidelines for Text Encoding and Interchange (TEI). Our document type definition is a parametrisation of the TEI which directly encodes translation units of the bi-texts, in a manner similar to that of translation memories. The corpus is aimed as a widely-distributable dataset for language engineering and for translation and terminology studies. The paper describes the compilation of the corpus, its composition, encoding and availability. We highlight the corpus acquisition and distribution bottlenecks and present our solutions. These have to do with the workflow in the project, and, not unrelatedly, with the encoding scheme for the corpus.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="erjavec-1999-elan">
<titleInfo>
<title>The ELAN Slovene-English aligned corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tomaz</namePart>
<namePart type="family">Erjavec</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>1999-09-13/1999-09-17</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit VII</title>
</titleInfo>
<originInfo>
<place>
<placeTerm type="text">Singapore, Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multilingual parallel corpora are a basic resource for research and development of MT. Such corpora are still scarce, especially for lower-diffusion languages. The paper presents a sentence-aligned tokenised Slovene-English corpus, developed in the scope of the EU ELAN project. The corpus contains 1 million words from fifteen recent terminology-rich texts and is encoded according to the Guidelines for Text Encoding and Interchange (TEI). Our document type definition is a parametrisation of the TEI which directly encodes translation units of the bi-texts, in a manner similar to that of translation memories. The corpus is aimed as a widely-distributable dataset for language engineering and for translation and terminology studies. The paper describes the compilation of the corpus, its composition, encoding and availability. We highlight the corpus acquisition and distribution bottlenecks and present our solutions. These have to do with the workflow in the project, and, not unrelatedly, with the encoding scheme for the corpus.</abstract>
<identifier type="citekey">erjavec-1999-elan</identifier>
<location>
<url>https://aclanthology.org/1999.mtsummit-1.51</url>
</location>
<part>
<date>1999-09-13/1999-09-17</date>
<extent unit="page">
<start>349</start>
<end>357</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The ELAN Slovene-English aligned corpus
%A Erjavec, Tomaz
%S Proceedings of Machine Translation Summit VII
%D 1999
%8 September 13-17
%C Singapore, Singapore
%F erjavec-1999-elan
%X Multilingual parallel corpora are a basic resource for research and development of MT. Such corpora are still scarce, especially for lower-diffusion languages. The paper presents a sentence-aligned tokenised Slovene-English corpus, developed in the scope of the EU ELAN project. The corpus contains 1 million words from fifteen recent terminology-rich texts and is encoded according to the Guidelines for Text Encoding and Interchange (TEI). Our document type definition is a parametrisation of the TEI which directly encodes translation units of the bi-texts, in a manner similar to that of translation memories. The corpus is aimed as a widely-distributable dataset for language engineering and for translation and terminology studies. The paper describes the compilation of the corpus, its composition, encoding and availability. We highlight the corpus acquisition and distribution bottlenecks and present our solutions. These have to do with the workflow in the project, and, not unrelatedly, with the encoding scheme for the corpus.
%U https://aclanthology.org/1999.mtsummit-1.51
%P 349-357
Markdown (Informal)
[The ELAN Slovene-English aligned corpus](https://aclanthology.org/1999.mtsummit-1.51) (Erjavec, MTSummit 1999)
ACL