% Conference paper, EACL 2026 (Volume 5: Industry Track); ACL Anthology ID 2026.eacl-industry.56.
% NOTE(review): url uses the canonical aclanthology.org host rather than the
% temporary ingest-eacl preview host -- confirm it resolves for this Anthology ID.
% The abstract is kept verbatim from the published record.
@inproceedings{sahay-etal-2026-mirage,
  author    = {Sahay, Rishav and
               Tekumalla, Lavanya Sita and
               Saladi, Anoop},
  title     = {{MIRAGE}: Metadata-guided Image Retrieval and Answer Generation for {E-commerce} Troubleshooting},
  editor    = {Matusevych, Yevgen and
               Eryi{\u{g}}it, G{\"u}l{\c{s}}en and
               Aletras, Nikolaos},
  booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 5: Industry Track)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  pages     = {764--776},
  isbn      = {979-8-89176-384-5},
  url       = {https://aclanthology.org/2026.eacl-industry.56/},
  abstract  = {Existing multimodal systems typically associate text and available images based on embedding similarity or simple co-location, but such approaches often fail to ensure that the linked image accurately depicts the specific product or component mentioned in a troubleshooting instruction. We introduce MIRAGE, a metadata-first paradigm that treats structured metadata, (not raw pixels), as a first-class modality for multimodal grounding. In MIRAGE, both text and images are projected through a shared semantic schema capturing product attributes, context, and visual aspects, enabling reasoning over interpretable attributes for troubleshooting rather than unstructured embeddings. MIRAGE comprises of three complementary modules: M-Link for schema-guided image{--}text linking, M-Gen for metadata-conditioned multimodal generation, and M-Eval for consistency evaluation in the same structured space. Experiments on large-scale enterprise e-commerce troubleshooting data across 10 product types on 100K text chunks and 35K images show that metadata-centric grounding achieves over 40{\%} higher linking coverage of high-quality visual content and over 45{\%} in linking and response quality than embedding-based baselines. MIRAGE demonstrates the potential of structured metadata in enabling scalable, fine-grained grounding in multimodal troubleshooting systems.},
}
Markdown (Informal)
[MIRAGE: Metadata-guided Image Retrieval and Answer Generation for E-commerce Troubleshooting](https://aclanthology.org/2026.eacl-industry.56/) (Sahay et al., EACL 2026)
ACL