@inproceedings{schenner-nordhoff-2016-extracting,
title = "Extracting Interlinear Glossed Text from {L}a{T}e{X} Documents",
author = "Schenner, Mathias and
Nordhoff, Sebastian",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1638",
pages = "4044--4048",
abstract = "We present texigt, a command-line tool for the extraction of structured linguistic data from LaTeX source documents, and a language resource that has been generated using this tool: a corpus of interlinear glossed text (IGT) extracted from open access books published by Language Science Press. Extracted examples are represented in a simple XML format that is easy to process and can be used to validate certain aspects of interlinear glossed text. The main challenge involved is the parsing of TeX and LaTeX documents. We review why this task is impossible in general and how the texhs Haskell library uses a layered architecture and selective early evaluation (expansion) during lexing and parsing in order to provide access to structured representations of LaTeX documents at several levels. In particular, its parsing modules generate an abstract syntax tree for LaTeX documents after expansion of all user-defined macros and lexer-level commands that serves as an ideal interface for the extraction of interlinear glossed text by texigt. This architecture can easily be adapted to extract other types of linguistic data structures from LaTeX source documents.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schenner-nordhoff-2016-extracting">
<titleInfo>
<title>Extracting Interlinear Glossed Text from LaTeX Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mathias</namePart>
<namePart type="family">Schenner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Nordhoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present texigt, a command-line tool for the extraction of structured linguistic data from LaTeX source documents, and a language resource that has been generated using this tool: a corpus of interlinear glossed text (IGT) extracted from open access books published by Language Science Press. Extracted examples are represented in a simple XML format that is easy to process and can be used to validate certain aspects of interlinear glossed text. The main challenge involved is the parsing of TeX and LaTeX documents. We review why this task is impossible in general and how the texhs Haskell library uses a layered architecture and selective early evaluation (expansion) during lexing and parsing in order to provide access to structured representations of LaTeX documents at several levels. In particular, its parsing modules generate an abstract syntax tree for LaTeX documents after expansion of all user-defined macros and lexer-level commands that serves as an ideal interface for the extraction of interlinear glossed text by texigt. This architecture can easily be adapted to extract other types of linguistic data structures from LaTeX source documents.</abstract>
<identifier type="citekey">schenner-nordhoff-2016-extracting</identifier>
<location>
<url>https://aclanthology.org/L16-1638</url>
</location>
<part>
<date>2016-may</date>
<extent unit="page">
<start>4044</start>
<end>4048</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Extracting Interlinear Glossed Text from LaTeX Documents
%A Schenner, Mathias
%A Nordhoff, Sebastian
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 may
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F schenner-nordhoff-2016-extracting
%X We present texigt, a command-line tool for the extraction of structured linguistic data from LaTeX source documents, and a language resource that has been generated using this tool: a corpus of interlinear glossed text (IGT) extracted from open access books published by Language Science Press. Extracted examples are represented in a simple XML format that is easy to process and can be used to validate certain aspects of interlinear glossed text. The main challenge involved is the parsing of TeX and LaTeX documents. We review why this task is impossible in general and how the texhs Haskell library uses a layered architecture and selective early evaluation (expansion) during lexing and parsing in order to provide access to structured representations of LaTeX documents at several levels. In particular, its parsing modules generate an abstract syntax tree for LaTeX documents after expansion of all user-defined macros and lexer-level commands that serves as an ideal interface for the extraction of interlinear glossed text by texigt. This architecture can easily be adapted to extract other types of linguistic data structures from LaTeX source documents.
%U https://aclanthology.org/L16-1638
%P 4044-4048
Markdown (Informal)
[Extracting Interlinear Glossed Text from LaTeX Documents](https://aclanthology.org/L16-1638) (Schenner & Nordhoff, LREC 2016)
ACL
- Mathias Schenner and Sebastian Nordhoff. 2016. Extracting Interlinear Glossed Text from LaTeX Documents. In Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC'16), pages 4044–4048, Portorož, Slovenia. European Language Resources Association (ELRA).