@inproceedings{van-der-goot-etal-2022-frustratingly,
title = "Frustratingly Easy Performance Improvements for Low-resource Setups: A Tale on {BERT} and Segment Embeddings",
author = {van der Goot, Rob and
M{\"u}ller-Eberstein, Max and
Plank, Barbara},
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2022.lrec-1.152/",
pages = "1418--1427",
abstract = "As input representation for each sub-word, the original BERT architecture proposes the sum of the sub-word embedding, position embedding and a segment embedding. Sub-word and position embeddings are well-known and studied, and encode lexical information and word position, respectively. In contrast, segment embeddings are less known and have so far received no attention, despite being ubiquitous in large pre-trained language models. The key idea of segment embeddings is to encode to which of the two sentences (segments) a word belongs to {---} the intuition is to inform the model about the separation of sentences for the next sentence prediction pre-training task. However, little is known on whether the choice of segment impacts performance. In this work, we try to fill this gap and empirically study the impact of the segment embedding during inference time for a variety of pre-trained embeddings and target tasks. We hypothesize that for single-sentence prediction tasks performance is not affected {---} neither in mono- nor multilingual setups {---} while it matters when swapping segment IDs in paired-sentence tasks. To our surprise, this is not the case. Although for classification tasks and monolingual BERT models no large differences are observed, particularly word-level multilingual prediction tasks are heavily impacted. For low-resource syntactic tasks, we observe impacts of segment embedding and multilingual BERT choice. We find that the default setting for the most used multilingual BERT model underperforms heavily, and a simple swap of the segment embeddings yields an average improvement of 2.5 points absolute LAS score for dependency parsing over 9 different treebanks."
}
Markdown (Informal)
[Frustratingly Easy Performance Improvements for Low-resource Setups: A Tale on BERT and Segment Embeddings](https://preview.aclanthology.org/jlcl-multiple-ingestion/2022.lrec-1.152/) (van der Goot et al., LREC 2022)
ACL