@inproceedings{wang-etal-2025-rav,
title = "{RAV}: Retrieval-Augmented Voting for Tactile Descriptions Without Training",
author = "Wang, Jinlin and
Ji, Yulong and
Yang, Hongyu",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.315/",
pages = "6198--6205",
ISBN = "979-8-89176-332-6",
abstract = "Tactile perception is essential for human-environment interaction, and deriving tactile descriptions from multimodal data is a key challenge for embodied intelligence to understand human perception. Conventional approaches relying on extensive parameter learning for multimodal perception are rigid and computationally inefficient. To address this, we introduce Retrieval-Augmented Voting (RAV), a parameter-free method that constructs visual-tactile cross-modal knowledge directly. RAV retrieves similar visual-tactile data for given visual and tactile inputs and generates tactile descriptions through a voting mechanism. In experiments, we applied three voting strategies, SyncVote, DualVote and WeightVote, achieving performance comparable to large-scale cross-modal models without training. Comparative experiments across datasets of varying quality{---}defined by annotation accuracy and data diversity{---}demonstrate that RAV{'}s performance improves with higher-quality data at no additional computational cost. Code, and model checkpoints are opensourced at https://github.com/PluteW/RAV."
}
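A minimal, self-contained sketch of the generic retrieval-plus-voting idea the abstract describes: retrieve the entries most similar to a query from a visual-tactile knowledge base, then choose a tactile description by majority vote. The cosine-similarity measure, the top-k value, and all function names below are illustrative assumptions; this is not the authors' released implementation or their SyncVote/DualVote/WeightVote strategies, which are available at https://github.com/PluteW/RAV.

```python
"""Illustrative retrieval-augmented voting sketch (assumptions, not the RAV codebase)."""
from collections import Counter

import numpy as np


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two feature vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))


def retrieve(query: np.ndarray, knowledge_base: list[tuple[np.ndarray, str]], k: int = 3):
    """Return the k knowledge-base entries most similar to the query features."""
    ranked = sorted(knowledge_base,
                    key=lambda item: cosine_similarity(query, item[0]),
                    reverse=True)
    return ranked[:k]


def majority_vote(retrieved: list[tuple[np.ndarray, str]]) -> str:
    """Pick the most frequent tactile description among the retrieved entries."""
    labels = [label for _, label in retrieved]
    return Counter(labels).most_common(1)[0][0]


if __name__ == "__main__":
    # Toy knowledge base: (feature vector, tactile description) pairs.
    rng = np.random.default_rng(0)
    kb = [
        (rng.normal(size=8), "smooth"),
        (rng.normal(size=8), "smooth"),
        (rng.normal(size=8), "rough"),
        (rng.normal(size=8), "soft"),
    ]
    # Query features close to the first entry; expect "smooth" to win the vote.
    query_features = kb[0][0] + 0.05 * rng.normal(size=8)
    print(majority_vote(retrieve(query_features, kb, k=3)))
```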