@comment{NAACL 2025 long paper, ACL Anthology ID 2025.naacl-long.234.
The address field holds the location given in the Anthology export.}
@inproceedings{min-etal-2025-unihgkr,
  title     = {{UniHGKR}: Unified Instruction-aware Heterogeneous Knowledge Retrievers},
  author    = {Min, Dehai and
               Xu, Zhiyang and
               Qi, Guilin and
               Huang, Lifu and
               You, Chenyu},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-long.234/},
  pages     = {4577--4594},
  isbn      = {979-8-89176-189-6},
  abstract  = {Existing information retrieval (IR) models often assume a homogeneous structure for knowledge sources and user queries, limiting their applicability in real-world settings where retrieval is inherently heterogeneous and diverse. In this paper, we introduce UniHGKR, a unified instruction-aware heterogeneous knowledge retriever that (1) builds a unified retrieval space for heterogeneous knowledge and (2) follows diverse user instructions to retrieve knowledge in specified types. UniHGKR consists of three principal stages, including heterogeneous self-supervised pretraining, text-anchored embedding alignment, and instruction-aware retriever fine-tuning, enabling it to generalize across varied retrieval contexts. This framework is highly scalable, with a BERT-based version and a UniHGKR-7B version trained on large language models. Also, we introduce CompMix-IR, the first native heterogeneous knowledge retrieval benchmark. It includes two retrieval scenarios with various instructions, over 9,400 question answer (QA) pairs, and a corpus of 10 million entries, covering four different types of data. Extensive experiments show that UniHGKR consistently outperforms state-of-the-art methods on CompMix-IR, achieving up to 6.36\% and 54.23\% relative improvements in two scenarios, respectively. Finally, by equipping our retriever for open-domain heterogeneous QA systems, we achieve a new state-of-the-art result on the popular ConvMix task, with an absolute improvement of up to 5.90 points.},
}
Markdown (Informal)
[UniHGKR: Unified Instruction-aware Heterogeneous Knowledge Retrievers](https://aclanthology.org/2025.naacl-long.234/) (Min et al., NAACL 2025)
ACL
- Dehai Min, Zhiyang Xu, Guilin Qi, Lifu Huang, and Chenyu You. 2025. UniHGKR: Unified Instruction-aware Heterogeneous Knowledge Retrievers. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 4577–4594, Albuquerque, New Mexico. Association for Computational Linguistics.