@inproceedings{lefever-etal-2020-identifying,
title = "Identifying Cognates in {E}nglish-{D}utch and {F}rench-{D}utch by means of Orthographic Information and Cross-lingual Word Embeddings",
author = "Lefever, Els and
Labat, Sofie and
Singh, Pranaydeep",
booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.504",
pages = "4096--4101",
abstract = "This paper investigates the validity of combining more traditional orthographic information with cross-lingual word embeddings to identify cognate pairs in English-Dutch and French-Dutch. In a first step, lists of potential cognate pairs in English-Dutch and French-Dutch are manually labelled. The resulting gold standard is used to train and evaluate a multi-layer perceptron that can distinguish cognates from non-cognates. Fifteen orthographic features capture string similarities between source and target words, while the cosine similarity between their word embeddings represents the semantic relation between these words. By adding domain-specific information to pretrained fastText embeddings, we are able to obtain good embeddings for words that did not yet have a pretrained embedding (e.g. Dutch compound nouns). These embeddings are then aligned in a cross-lingual vector space by exploiting their structural similarity (cf. adversarial learning). Our results indicate that although the classifier already achieves good results on the basis of orthographic information, the performance further improves by including semantic information in the form of cross-lingual word embeddings.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lefever-etal-2020-identifying">
<titleInfo>
<title>Identifying Cognates in English-Dutch and French-Dutch by means of Orthographic Information and Cross-lingual Word Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Els</namePart>
<namePart type="family">Lefever</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sofie</namePart>
<namePart type="family">Labat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pranaydeep</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>This paper investigates the validity of combining more traditional orthographic information with cross-lingual word embeddings to identify cognate pairs in English-Dutch and French-Dutch. In a first step, lists of potential cognate pairs in English-Dutch and French-Dutch are manually labelled. The resulting gold standard is used to train and evaluate a multi-layer perceptron that can distinguish cognates from non-cognates. Fifteen orthographic features capture string similarities between source and target words, while the cosine similarity between their word embeddings represents the semantic relation between these words. By adding domain-specific information to pretrained fastText embeddings, we are able to obtain good embeddings for words that did not yet have a pretrained embedding (e.g. Dutch compound nouns). These embeddings are then aligned in a cross-lingual vector space by exploiting their structural similarity (cf. adversarial learning). Our results indicate that although the classifier already achieves good results on the basis of orthographic information, the performance further improves by including semantic information in the form of cross-lingual word embeddings.</abstract>
<identifier type="citekey">lefever-etal-2020-identifying</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.504</url>
</location>
<part>
<date>2020-may</date>
<extent unit="page">
<start>4096</start>
<end>4101</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Identifying Cognates in English-Dutch and French-Dutch by means of Orthographic Information and Cross-lingual Word Embeddings
%A Lefever, Els
%A Labat, Sofie
%A Singh, Pranaydeep
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F lefever-etal-2020-identifying
%X This paper investigates the validity of combining more traditional orthographic information with cross-lingual word embeddings to identify cognate pairs in English-Dutch and French-Dutch. In a first step, lists of potential cognate pairs in English-Dutch and French-Dutch are manually labelled. The resulting gold standard is used to train and evaluate a multi-layer perceptron that can distinguish cognates from non-cognates. Fifteen orthographic features capture string similarities between source and target words, while the cosine similarity between their word embeddings represents the semantic relation between these words. By adding domain-specific information to pretrained fastText embeddings, we are able to obtain good embeddings for words that did not yet have a pretrained embedding (e.g. Dutch compound nouns). These embeddings are then aligned in a cross-lingual vector space by exploiting their structural similarity (cf. adversarial learning). Our results indicate that although the classifier already achieves good results on the basis of orthographic information, the performance further improves by including semantic information in the form of cross-lingual word embeddings.
%U https://aclanthology.org/2020.lrec-1.504
%P 4096-4101
Markdown (Informal)
[Identifying Cognates in English-Dutch and French-Dutch by means of Orthographic Information and Cross-lingual Word Embeddings](https://aclanthology.org/2020.lrec-1.504) (Lefever et al., LREC 2020)
ACL