% Malmasi and Dras (2017), short paper in the ACL 2017 proceedings (Volume 2).
% The url field uses the canonical ACL Anthology address for paper P17-2063
% (the same ID that appears in the doi) rather than a temporary preview build.
@inproceedings{malmasi-dras-2017-feature,
  author    = {Malmasi, Shervin and Dras, Mark},
  title     = {Feature Hashing for Language and Dialect Identification},
  editor    = {Barzilay, Regina and Kan, Min-Yen},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {399--403},
  doi       = {10.18653/v1/P17-2063},
  url       = {https://aclanthology.org/P17-2063/},
  abstract  = {We evaluate feature hashing for language identification (LID), a method not previously used for this task. Using a standard dataset, we first show that while feature performance is high, LID data is highly dimensional and mostly sparse ({\ensuremath{>}}99.5{\%}) as it includes large vocabularies for many languages; memory requirements grow as languages are added. Next we apply hashing using various hash sizes, demonstrating that there is no performance loss with dimensionality reductions of up to 86{\%}. We also show that using an ensemble of low-dimension hash-based classifiers further boosts performance. Feature hashing is highly useful for LID and holds great promise for future work in this area.},
}
Markdown (Informal)
[Feature Hashing for Language and Dialect Identification](https://aclanthology.org/P17-2063/) (Malmasi & Dras, ACL 2017)
ACL
- Shervin Malmasi and Mark Dras. 2017. Feature Hashing for Language and Dialect Identification. In Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pages 399–403, Vancouver, Canada. Association for Computational Linguistics.