@comment{
  Conference-paper record for the KWYS paper (Soni et al., ACL 2026).
  The method name in the title is brace-protected so that sentence-casing
  bibliography styles do not lowercase it.
  NOTE(review): url points at the preview.aclanthology.org ingest host;
  presumably it should be replaced by the permanent Anthology URL once the
  volume is published -- confirm.
  NOTE(review): the Anthology ID in url is acl-industry, while booktitle
  names the main ACL 2026 proceedings -- confirm the volume title.
}
@inproceedings{soni-etal-2026-know,
  title     = {{Know What You See}: Grounded localization of product components},
  author    = {Soni, Manan and
               Kanagarajan, Abinesh and
               Mohan, Shyam},
  editor    = {Li, Yunyao and
               Rehm, Georg and
               Tu, Mei},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-industry.75/},
  pages     = {1077--1088},
  isbn      = {979-8-89176-394-4},
  abstract  = {Many real-world decisions about products (e.g. how they function, how they should be used) depend on their components rather than the object as a whole. Accurately identifying product component has applications like automated defect detection, visual spare-parts search, and verified assembly. However, existing object detectors treat components as isolated objects, ignoring their inherent structure. We propose Know What You See (KWYS), where we localize components by grounding them using a textual knowledge base (e.g., manuals or web descriptions). KWYS converts raw text into a hierarchical component taxonomy, which then guides an open-vocabulary object detector using a hierarchical verification algorithm. We evaluate on 1,000 product images across 5 diverse categories, improving component localization accuracy by 11{\%} along with reducing component hallucinations by 25{\%}.},
}
Markdown (Informal)
[Know What You See: Grounded localization of product components](https://preview.aclanthology.org/ingest-acl/2026.acl-industry.75/) (Soni et al., ACL 2026)
ACL
- Manan Soni, Abinesh Kanagarajan, and Shyam Mohan. 2026. Know What You See: Grounded localization of product components. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026), pages 1077–1088, San Diego, California, USA. Association for Computational Linguistics.