@inproceedings{kennington-2021-enriching,
title = "Enriching Language Models with Visually-grounded Word Vectors and the {Lancaster} Sensorimotor Norms",
author = "Kennington, Casey",
booktitle = "Proceedings of the 25th Conference on Computational Natural Language Learning",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.conll-1.11",
doi = "10.18653/v1/2021.conll-1.11",
pages = "148--157",
abstract = "Language models are trained only on text despite the fact that humans learn their first language in a highly interactive and multimodal environment where the first set of learned words are largely concrete, denoting physical entities and embodied states. To enrich language models with some of this missing experience, we leverage two sources of information: (1) the Lancaster Sensorimotor norms, which provide ratings (means and standard deviations) for over 40,000 English words along several dimensions of embodiment, and which capture the extent to which something is experienced across 11 different sensory modalities, and (2) vectors from coefficients of binary classifiers trained on images for the BERT vocabulary. We pre-trained the ELECTRA model and fine-tuned the RoBERTa model with these two sources of information then evaluate using the established GLUE benchmark and the Visual Dialog benchmark. We find that enriching language models with the Lancaster norms and image vectors improves results in both tasks, with some implications for robust language models that capture holistic linguistic meaning in a language learning context.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kennington-2021-enriching">
<titleInfo>
<title>Enriching Language Models with Visually-grounded Word Vectors and the Lancaster Sensorimotor Norms</title>
</titleInfo>
<name type="personal">
<namePart type="given">Casey</namePart>
<namePart type="family">Kennington</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 25th Conference on Computational Natural Language Learning</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language models are trained only on text despite the fact that humans learn their first language in a highly interactive and multimodal environment where the first set of learned words are largely concrete, denoting physical entities and embodied states. To enrich language models with some of this missing experience, we leverage two sources of information: (1) the Lancaster Sensorimotor norms, which provide ratings (means and standard deviations) for over 40,000 English words along several dimensions of embodiment, and which capture the extent to which something is experienced across 11 different sensory modalities, and (2) vectors from coefficients of binary classifiers trained on images for the BERT vocabulary. We pre-trained the ELECTRA model and fine-tuned the RoBERTa model with these two sources of information then evaluate using the established GLUE benchmark and the Visual Dialog benchmark. We find that enriching language models with the Lancaster norms and image vectors improves results in both tasks, with some implications for robust language models that capture holistic linguistic meaning in a language learning context.</abstract>
<identifier type="citekey">kennington-2021-enriching</identifier>
<identifier type="doi">10.18653/v1/2021.conll-1.11</identifier>
<location>
<url>https://aclanthology.org/2021.conll-1.11</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>148</start>
<end>157</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enriching Language Models with Visually-grounded Word Vectors and the Lancaster Sensorimotor Norms
%A Kennington, Casey
%S Proceedings of the 25th Conference on Computational Natural Language Learning
%D 2021
%8 nov
%I Association for Computational Linguistics
%C Online
%F kennington-2021-enriching
%X Language models are trained only on text despite the fact that humans learn their first language in a highly interactive and multimodal environment where the first set of learned words are largely concrete, denoting physical entities and embodied states. To enrich language models with some of this missing experience, we leverage two sources of information: (1) the Lancaster Sensorimotor norms, which provide ratings (means and standard deviations) for over 40,000 English words along several dimensions of embodiment, and which capture the extent to which something is experienced across 11 different sensory modalities, and (2) vectors from coefficients of binary classifiers trained on images for the BERT vocabulary. We pre-trained the ELECTRA model and fine-tuned the RoBERTa model with these two sources of information then evaluate using the established GLUE benchmark and the Visual Dialog benchmark. We find that enriching language models with the Lancaster norms and image vectors improves results in both tasks, with some implications for robust language models that capture holistic linguistic meaning in a language learning context.
%R 10.18653/v1/2021.conll-1.11
%U https://aclanthology.org/2021.conll-1.11
%U https://doi.org/10.18653/v1/2021.conll-1.11
%P 148-157
Markdown (Informal)
[Enriching Language Models with Visually-grounded Word Vectors and the Lancaster Sensorimotor Norms](https://aclanthology.org/2021.conll-1.11) (Kennington, CoNLL 2021)
ACL