% EMNLP 2024 main-conference paper, ACL Anthology ID 2024.emnlp-main.736.
% The url field uses the canonical Anthology landing page (same ID as the DOI),
% not a preview/staging build, so the link stays valid over time.
@inproceedings{huang-etal-2024-language,
  title     = {Language Concept Erasure for Language-invariant Dense Retrieval},
  author    = {Huang, Zhiqi and
               Yu, Puxuan and
               Ravfogel, Shauli and
               Allan, James},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.736/},
  doi       = {10.18653/v1/2024.emnlp-main.736},
  pages     = {13261--13273},
  abstract  = {Multilingual models aim for language-invariant representations but still prominently encode language identity. This, along with the scarcity of high-quality parallel retrieval data, limits their performance in retrieval. We introduce LANCER, a multi-task learning framework that improves language-invariant dense retrieval by reducing language-specific signals in the embedding space. Leveraging the notion of linear concept erasure, we design a loss function that penalizes cross-correlation between representations and their language labels. LANCER leverages only English retrieval data and general multilingual corpora, training models to focus on language-invariant retrieval by semantic similarity without necessitating a vast parallel corpus. Experimental results on various datasets show our method consistently improves over baselines, with extensive analyses demonstrating greater language agnosticism.},
}
Markdown (Informal)
[Language Concept Erasure for Language-invariant Dense Retrieval](https://aclanthology.org/2024.emnlp-main.736/) (Huang et al., EMNLP 2024)
ACL