% Min, Lee, Kwon, and Lee -- ACL 2025 (Volume 1: Long Papers), pages 27471--27500.
% The url field uses the canonical ACL Anthology address rather than the
% temporary "ingestion-acl-25" preview host, which is a staging location.
% The address field holds the conference venue, following the Anthology export convention.
% NOTE(review): the export carried no doi field; add one once it has been confirmed.
@inproceedings{min-etal-2025-understanding,
  title     = {Understanding Impact of Human Feedback via Influence Functions},
  author    = {Min, Taywon and
               Lee, Haeone and
               Kwon, Yongchan and
               Lee, Kimin},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-long.1333/},
  pages     = {27471--27500},
  isbn      = {979-8-89176-251-0},
  abstract  = {In Reinforcement Learning from Human Feedback (RLHF), it is crucial to learn suitable reward models from human feedback to align large language models (LLMs) with human intentions. However, human feedback can often be noisy, inconsistent, or biased, especially when evaluating complex responses. Such feedback can lead to misaligned reward signals, potentially causing unintended side effects during the RLHF process. To address these challenges, we explore the use of influence functions to measure the impact of human feedback on the performance of reward models. We propose a compute-efficient approximation method that enables the application of influence functions to LLM-based reward models and large-scale preference datasets. Our experiments showcase two key applications of influence functions: (1) detecting common labeler biases in human feedback datasets and (2) guiding labelers in refining their strategies to better align with expert feedback. By quantifying the impact of human feedback, we believe that influence functions can enhance feedback interpretability and contribute to scalable oversight in RLHF, helping labelers provide more accurate and consistent feedback. Source code is available at https://github.com/mintaywon/IF{\_}RLHF.}
}
Markdown (Informal)
[Understanding Impact of Human Feedback via Influence Functions](https://aclanthology.org/2025.acl-long.1333/) (Min et al., ACL 2025)
ACL
- Taywon Min, Haeone Lee, Yongchan Kwon, and Kimin Lee. 2025. Understanding Impact of Human Feedback via Influence Functions. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 27471–27500, Vienna, Austria. Association for Computational Linguistics.