@inproceedings{yi-ounis-2025-multi,
title = "A Multi-modal Large Language Model with Graph-of-Thought for Effective Recommendation",
author = "Yi, Zixuan and
Ounis, Iadh",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.naacl-long.76/",
pages = "1591--1606",
ISBN = "979-8-89176-189-6",
abstract = "Chain-of-Thought (CoT) prompting has been shown to be effective in guiding Large Language Models (LLMs) to decompose complex tasks into multiple intermediate steps, and constructing a rational reasoning chain for inferring answers. However, the linear nature of CoT falls short from enabling LLMs to effectively handle graph structures, which are essential for personalized recommendation tasks that rely on user-item interaction graphs. To bridge this gap, we introduce GollaRec, which leverages a Graph-of-Thought (GoT) prompting technique in a Multi-modal LLM, namely LLaVA, to effectively exploit the complex structure of the interaction graphs. GollaRec enhances the recommendation effectiveness by integrating both visual and textual ``thoughts'' into a graph-structured prompt, using both item images and descriptions to produce richer multi-modal user/item representations. In our proposed approach, GollaRec leverages text-graph alignment and graph instruction tuning to allow the Multi-modal LLM to capture complex graph structures. In addition, GollaRec leverages a graph adaptor to integrate user-item interactions into the resulting user/item embeddings, therefore effectively adapting the model to the recommendation task. Our extensive experiments on 6 benchmark datasets demonstrate the superiority of our proposed GollaRec model over 12 existing state-of-the-art models in various multi-modal recommendation tasks, including general and multi-domain recommendation tasks."
}
Markdown (Informal)
[A Multi-modal Large Language Model with Graph-of-Thought for Effective Recommendation](https://preview.aclanthology.org/fix-sig-urls/2025.naacl-long.76/) (Yi & Ounis, NAACL 2025)
ACL