@inproceedings{chen-etal-2023-bla,
title = "The {BLA} Benchmark: Investigating Basic Language Abilities of Pre-Trained Multimodal Models",
author = "Chen, Xinyi and
Fern{\'a}ndez, Raquel and
Pezzelle, Sandro",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.emnlp-main.356/",
doi = "10.18653/v1/2023.emnlp-main.356",
pages = "5817--5830",
abstract = "Despite the impressive performance achieved by pre-trained language-and-vision models in downstream tasks, it remains an open question whether this reflects a proper understanding of image-text interaction. In this work, we explore to what extent they handle basic linguistic constructions{---}active-passive voice, coordination, and relative clauses{---}that even preschool children can typically master. We present BLA, a novel, automatically constructed benchmark to evaluate multimodal models on these Basic Language Abilities. We show that different types of Transformer-based systems, such as CLIP, ViLBERT, and BLIP2, generally struggle with BLA in a zero-shot setting, in line with previous findings. Our experiments, in particular, show that most of the tested models only marginally benefit when fine-tuned or prompted with construction-specific samples. Yet, the generative BLIP2 shows promising trends, especially in an in-context learning setting. This opens the door to using BLA not only as an evaluation benchmark but also to improve models' basic language abilities."
}
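
The abstract describes zero-shot evaluation of contrastive image-text models such as CLIP on sentence pairs targeting specific constructions (e.g., active vs. passive voice). Below is a minimal, hypothetical sketch of what such a zero-shot check might look like using the Hugging Face `transformers` CLIP API. The checkpoint name, image path, and example sentences are illustrative assumptions; this is not the authors' released benchmark or evaluation code.

```python
# Hypothetical sketch of BLA-style zero-shot image-sentence matching with CLIP.
# NOT the authors' code: checkpoint, image path, and sentences are illustrative.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model_name = "openai/clip-vit-base-patch32"  # assumed checkpoint
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)
model.eval()

image = Image.open("example.jpg")  # placeholder image path

# A BLA-style item pairs one image with correct and incorrect sentences for a
# construction (here, active-passive voice); these sentences are made up.
sentences = [
    "The dog is chasing the cat.",          # correct, active
    "The cat is being chased by the dog.",  # correct, passive
    "The cat is chasing the dog.",          # incorrect, active
    "The dog is being chased by the cat.",  # incorrect, passive
]

inputs = processor(text=sentences, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)

# If the model handles the construction, the correct sentences should receive
# higher image-text similarity scores than the incorrect ones.
scores = outputs.logits_per_image.squeeze(0)
for sentence, score in zip(sentences, scores.tolist()):
    print(f"{score:7.3f}  {sentence}")
```

A construction-specific accuracy in this setting would simply be the fraction of items for which both correct sentences outscore both incorrect ones; how the paper aggregates scores may differ.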