@inproceedings{nayak-etal-2020-domain,
title = "Domain adaptation challenges of {BERT} in tokenization and sub-word representations of Out-of-Vocabulary words",
author = "Nayak, Anmol and
Timmapathini, Hariprasad and
Ponnalagu, Karthikeyan and
Gopalan Venkoparao, Vijendran",
booktitle = "Proceedings of the First Workshop on Insights from Negative Results in NLP",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.insights-1.1",
doi = "10.18653/v1/2020.insights-1.1",
pages = "1--5",
abstract = "BERT model (Devlin et al., 2019) has achieved significant progress in several Natural Language Processing (NLP) tasks by leveraging the multi-head self-attention mechanism (Vaswani et al., 2017) in its architecture. However, it still has several research challenges which are not tackled well for domain specific corpus found in industries. In this paper, we have highlighted these problems through detailed experiments involving analysis of the attention scores and dynamic word embeddings with the BERT-Base-Uncased model. Our experiments have lead to interesting findings that showed: 1) Largest substring from the left that is found in the vocabulary (in-vocab) is always chosen at every sub-word unit that can lead to suboptimal tokenization choices, 2) Semantic meaning of a vocabulary word deteriorates when found as a substring in an Out-Of-Vocabulary (OOV) word, and 3) Minor misspellings in words are inadequately handled. We believe that if these challenges are tackled, it will significantly help the domain adaptation aspect of BERT.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="nayak-etal-2020-domain">
    <titleInfo>
      <title>Domain adaptation challenges of BERT in tokenization and sub-word representations of Out-of-Vocabulary words</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Anmol</namePart>
      <namePart type="family">Nayak</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hariprasad</namePart>
      <namePart type="family">Timmapathini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Karthikeyan</namePart>
      <namePart type="family">Ponnalagu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vijendran</namePart>
      <namePart type="family">Gopalan Venkoparao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-nov</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the First Workshop on Insights from Negative Results in NLP</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The BERT model (Devlin et al., 2019) has achieved significant progress in several Natural Language Processing (NLP) tasks by leveraging the multi-head self-attention mechanism (Vaswani et al., 2017) in its architecture. However, it still has several research challenges which are not tackled well for domain-specific corpora found in industry. In this paper, we highlight these problems through detailed experiments involving analysis of the attention scores and dynamic word embeddings with the BERT-Base-Uncased model. Our experiments have led to interesting findings that showed: 1) the largest substring from the left that is found in the vocabulary (in-vocab) is always chosen at every sub-word unit, which can lead to suboptimal tokenization choices, 2) the semantic meaning of a vocabulary word deteriorates when it is found as a substring in an Out-Of-Vocabulary (OOV) word, and 3) minor misspellings in words are inadequately handled. We believe that if these challenges are tackled, it will significantly help the domain adaptation aspect of BERT.</abstract>
<identifier type="citekey">nayak-etal-2020-domain</identifier>
<identifier type="doi">10.18653/v1/2020.insights-1.1</identifier>
<location>
<url>https://aclanthology.org/2020.insights-1.1</url>
</location>
<part>
<date>2020-nov</date>
<extent unit="page">
<start>1</start>
<end>5</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Domain adaptation challenges of BERT in tokenization and sub-word representations of Out-of-Vocabulary words
%A Nayak, Anmol
%A Timmapathini, Hariprasad
%A Ponnalagu, Karthikeyan
%A Gopalan Venkoparao, Vijendran
%S Proceedings of the First Workshop on Insights from Negative Results in NLP
%D 2020
%8 nov
%I Association for Computational Linguistics
%C Online
%F nayak-etal-2020-domain
%X The BERT model (Devlin et al., 2019) has achieved significant progress in several Natural Language Processing (NLP) tasks by leveraging the multi-head self-attention mechanism (Vaswani et al., 2017) in its architecture. However, it still has several research challenges which are not tackled well for domain-specific corpora found in industry. In this paper, we highlight these problems through detailed experiments involving analysis of the attention scores and dynamic word embeddings with the BERT-Base-Uncased model. Our experiments have led to interesting findings that showed: 1) the largest substring from the left that is found in the vocabulary (in-vocab) is always chosen at every sub-word unit, which can lead to suboptimal tokenization choices, 2) the semantic meaning of a vocabulary word deteriorates when it is found as a substring in an Out-Of-Vocabulary (OOV) word, and 3) minor misspellings in words are inadequately handled. We believe that if these challenges are tackled, it will significantly help the domain adaptation aspect of BERT.
%R 10.18653/v1/2020.insights-1.1
%U https://aclanthology.org/2020.insights-1.1
%U https://doi.org/10.18653/v1/2020.insights-1.1
%P 1-5
Markdown (Informal)
[Domain adaptation challenges of BERT in tokenization and sub-word representations of Out-of-Vocabulary words](https://aclanthology.org/2020.insights-1.1) (Nayak et al., insights 2020)
ACL

Anmol Nayak, Hariprasad Timmapathini, Karthikeyan Ponnalagu, and Vijendran Gopalan Venkoparao. 2020. Domain adaptation challenges of BERT in tokenization and sub-word representations of Out-of-Vocabulary words. In Proceedings of the First Workshop on Insights from Negative Results in NLP, pages 1–5, Online. Association for Computational Linguistics.
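
The abstract's first finding concerns greedy longest-prefix WordPiece matching in the BERT-Base-Uncased tokenizer, which can be observed directly. The snippet below is a minimal illustrative sketch, not taken from the paper, and assumes the Hugging Face transformers package is installed; the example words are arbitrary.

# Illustrative sketch (assumes the Hugging Face `transformers` package; example
# words are arbitrary): BERT-Base-Uncased's WordPiece tokenizer repeatedly takes
# the longest in-vocab prefix, so OOV and misspelled words can be split into
# sub-word units that do not align with meaningful morphemes.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# An out-of-vocabulary domain word: greedy longest-prefix matching at each step.
print(tokenizer.tokenize("electrocardiographically"))

# A minor misspelling of an in-vocab word yields very different sub-word units,
# and hence very different contextual embeddings.
print(tokenizer.tokenize("language"))
print(tokenizer.tokenize("langauge"))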