% Findings of EMNLP 2021 paper, ACL Anthology ID 2021.findings-emnlp.374.
% The url field uses the canonical Anthology address (not a temporary
% preview-branch build); the doi field is the persistent identifier.
@inproceedings{han-tsvetkov-2021-influence-tuning,
  title     = {Influence Tuning: Demoting Spurious Correlations via Instance Attribution and Instance-Driven Updates},
  author    = {Han, Xiaochuang and
               Tsvetkov, Yulia},
  editor    = {Moens, Marie-Francine and
               Huang, Xuanjing and
               Specia, Lucia and
               Yih, Scott Wen-tau},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2021},
  month     = nov,
  year      = {2021},
  address   = {Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.findings-emnlp.374/},
  doi       = {10.18653/v1/2021.findings-emnlp.374},
  pages     = {4398--4409},
  abstract  = {Among the most critical limitations of deep learning NLP models are their lack of interpretability, and their reliance on spurious correlations. Prior work proposed various approaches to interpreting the black-box models to unveil the spurious correlations, but the research was primarily used in human-computer interaction scenarios. It still remains underexplored whether or how such model interpretations can be used to automatically {\textquotedblleft}unlearn{\textquotedblright} confounding features. In this work, we propose influence tuning{---}a procedure that leverages model interpretations to update the model parameters towards a plausible interpretation (rather than an interpretation that relies on spurious patterns in the data) in addition to learning to predict the task labels. We show that in a controlled setup, influence tuning can help deconfounding the model from spurious patterns in data, significantly outperforming baseline methods that use adversarial training.},
}
Markdown (Informal)
[Influence Tuning: Demoting Spurious Correlations via Instance Attribution and Instance-Driven Updates](https://aclanthology.org/2021.findings-emnlp.374/) (Han & Tsvetkov, Findings 2021)
ACL