% RANLP 2021 conference paper (ACL Anthology ID 2021.ranlp-1.7).
% The url field uses the canonical aclanthology.org address rather than a
% temporary preview-branch host, so the link stays valid over time.
@inproceedings{al-sharou-etal-2021-towards,
  title     = {Towards a Better Understanding of Noise in Natural Language Processing},
  author    = {Al Sharou, Khetam and
               Li, Zhenhao and
               Specia, Lucia},
  editor    = {Mitkov, Ruslan and
               Angelova, Galia},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing ({RANLP} 2021)},
  month     = sep,
  year      = {2021},
  address   = {Held Online},
  publisher = {INCOMA Ltd.},
  url       = {https://aclanthology.org/2021.ranlp-1.7/},
  pages     = {53--62},
  abstract  = {In this paper, we propose a definition and taxonomy of various types of non-standard textual content {--} generally referred to as ``noise'' {--} in Natural Language Processing (NLP). While data pre-processing is undoubtedly important in NLP, especially when dealing with user-generated content, a broader understanding of different sources of noise and how to deal with them is an aspect that has been largely neglected. We provide a comprehensive list of potential sources of noise, categorise and describe them, and show the impact of a subset of standard pre-processing strategies on different tasks. Our main goal is to raise awareness of non-standard content {--} which should not always be considered as ``noise'' {--} and of the need for careful, task-dependent pre-processing. This is an alternative to blanket, all-encompassing solutions generally applied by researchers through ``standard'' pre-processing pipelines. The intention is for this categorisation to serve as a point of reference to support NLP researchers in devising strategies to clean, normalise or embrace non-standard content.},
}
Markdown (Informal)
[Towards a Better Understanding of Noise in Natural Language Processing](https://aclanthology.org/2021.ranlp-1.7/) (Al Sharou et al., RANLP 2021)
ACL