% Workshop paper in the MMLoSo 2025 proceedings, as exported from the ACL Anthology.
% NOTE(review): the url field points at an Anthology preview/ingest build rather than
% a stable landing page -- confirm the canonical link (or DOI) once published and swap it in.
@inproceedings{tang-jacob-2025-language,
  title     = {Language as a Label: Zero-Shot Multimodal Classification of Everyday Postures under Data Scarcity},
  author    = {Tang, Ming Ze and
               Jacob, Jubal Chandy},
  editor    = {Shukla, Ankita and
               Kumar, Sandeep and
               Bedi, Amrit Singh and
               Chakraborty, Tanmoy},
  booktitle = {Proceedings of the 1st Workshop on Multimodal Models for Low-Resource Contexts and Social Impact ({MMLoSo} 2025)},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.mmloso-1.5/},
  pages     = {48--57},
  isbn      = {979-8-89176-311-1},
  abstract  = {This paper investigates how the specificity of natural language prompts influences zero-shot classification performance in modern vision language models (VLMs) under severe data scarcity. Using a curated 285 image subset of MS COCO containing three everyday postures (sitting, standing, and walking/running), we evaluate OpenCLIP, MetaCLIP2, and SigLIP alongside unimodal and pose-based baselines. We introduce a three tier prompt design, minimal labels, action cues, and compact geometric descriptions and systematically vary only the linguistic detail. Our results reveal a counterintuitive trend where simpler prompts consistently outperform more detailed ones, a phenomenon we term prompt overfitting. Grad-CAM attribution further shows that prompt specificity shifts attention between contextual and pose-relevant regions, explaining the model dependent behaviour. The study provides a controlled analysis of prompt granularity in low resource image based posture recognition, highlights the need for careful prompt design when labels are scarce.},
}
Markdown (Informal)
[Language as a Label: Zero-Shot Multimodal Classification of Everyday Postures under Data Scarcity](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.mmloso-1.5/) (Tang & Jacob, MMLoSo 2025)
ACL