@inproceedings{tan-etal-2020-mind,
title = "Mind Your Inflections! {I}mproving {NLP} for Non-Standard {E}nglishes with {B}ase-{I}nflection {E}ncoding",
author = "Tan, Samson and
Joty, Shafiq and
Varshney, Lav and
Kan, Min-Yen",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2020.emnlp-main.455/",
doi = "10.18653/v1/2020.emnlp-main.455",
pages = "5647--5663",
abstract = "Inflectional variation is a common feature of World Englishes such as Colloquial Singapore English and African American Vernacular English. Although comprehension by human readers is usually unimpaired by non-standard inflections, current NLP systems are not yet robust. We propose Base-Inflection Encoding (BITE), a method to tokenize English text by reducing inflected words to their base forms before reinjecting the grammatical information as special symbols. Fine-tuning pretrained NLP models for downstream tasks using our encoding defends against inflectional adversaries while maintaining performance on clean data. Models using BITE generalize better to dialects with non-standard inflections without explicit training and translation models converge faster when trained with BITE. Finally, we show that our encoding improves the vocabulary efficiency of popular data-driven subword tokenizers. Since there has been no prior work on quantitatively evaluating vocabulary efficiency, we propose metrics to do so."
}
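
A minimal, self-contained sketch of the Base-Inflection Encoding (BITE) idea described in the abstract: each inflected word is reduced to its base form, and the grammatical information is re-injected as a special symbol. The tiny lemma table, tag symbols, and function name below are illustrative assumptions, not the authors' implementation, which uses a full lemmatizer and a richer symbol inventory.

```python
# Illustrative sketch of Base-Inflection Encoding (BITE).
# word -> (base form, inflection symbol); symbols loosely follow Penn Treebank tags.
LEMMA_TABLE = {
    "dogs": ("dog", "[NNS]"),     # plural noun
    "walked": ("walk", "[VBD]"),  # past-tense verb
    "has": ("have", "[VBZ]"),     # 3rd-person singular present
}

def bite_encode(sentence: str) -> list[str]:
    """Encode a sentence as base forms plus special inflection symbols."""
    encoded = []
    for word in sentence.lower().split():
        base, symbol = LEMMA_TABLE.get(word, (word, None))
        encoded.append(base)
        if symbol is not None:        # word was inflected: re-inject the grammar
            encoded.append(symbol)
    return encoded

print(bite_encode("she has two dogs and walked them"))
# -> ['she', 'have', '[VBZ]', 'two', 'dog', '[NNS]', 'and', 'walk', '[VBD]', 'them']
```

Because non-standard inflections collapse to the same base form, a downstream model fine-tuned on this encoding sees the same base tokens regardless of dialectal inflection, which is the robustness property the abstract describes.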