% ACL Anthology paper W17-5028 (12th Workshop on Innovative Use of NLP for
% Building Educational Applications, 2017).
% The url field holds the canonical aclanthology.org address rather than a
% temporary preview-build mirror; the doi is the preferred persistent identifier.
@inproceedings{rama-coltekin-2017-fewer,
  title     = {Fewer features perform well at Native Language Identification task},
  author    = {Rama, Taraka and
               {\c{C}}{\"o}ltekin, {\c{C}}a{\u{g}}r{\i}},
  editor    = {Tetreault, Joel and
               Burstein, Jill and
               Leacock, Claudia and
               Yannakoudakis, Helen},
  booktitle = {Proceedings of the 12th Workshop on Innovative Use of {NLP} for Building Educational Applications},
  month     = sep,
  year      = {2017},
  address   = {Copenhagen, Denmark},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/W17-5028/},
  doi       = {10.18653/v1/W17-5028},
  pages     = {255--260},
  abstract  = {This paper describes our results at the NLI shared task 2017. We participated in essays, speech, and fusion task that uses text, speech, and i-vectors for the task of identifying the native language of the given input. In the essay track, a linear SVM system using word bigrams and character 7-grams performed the best. In the speech track, an LDA classifier based only on i-vectors performed better than a combination system using text features from speech transcriptions and i-vectors. In the fusion task, we experimented with systems that used combination of i-vectors with higher order n-grams features, combination of i-vectors with word unigrams, a mean probability ensemble, and a stacked ensemble system. Our finding is that word unigrams in combination with i-vectors achieve higher score than systems trained with larger number of $n$-gram features. Our best-performing systems achieved F1-scores of 87.16{\%}, 83.33{\%} and 91.75{\%} on the essay track, the speech track and the fusion track respectively.},
}
Markdown (Informal)
[Fewer features perform well at Native Language Identification task](https://aclanthology.org/W17-5028/) (Rama & Çöltekin, BEA 2017)
ACL