@inproceedings{lin-etal-2024-dogerm,
title = "{D}oge{RM}: Equipping Reward Models with Domain Knowledge through Model Merging",
author = "Lin, Tzu-Han and
Li, Chen-An and
Lee, Hung-yi and
Chen, Yun-Nung",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.emnlp-main.868/",
doi = "10.18653/v1/2024.emnlp-main.868",
pages = "15506--15524",
abstract = "Reinforcement learning from human feedback (RLHF) is a popular strategy for aligning large language models (LLMs) with desired behaviors. Reward modeling is a crucial step in RLHF. However, collecting paired preference data for training reward models is often costly and time-consuming, especially for domain-specific preferences requiring expert annotation. To address this challenge, we propose the **Do**main knowled**ge** merged **R**eward **M**odel (**DogeRM**), a novel framework that integrates domain-specific knowledge into a general reward model by model merging. The experiments demonstrate that DogeRM enhances performance across different benchmarks and provide a detailed analysis showcasing the effects of model merging, showing the great potential of facilitating model alignment."
}
Markdown (Informal)
[DogeRM: Equipping Reward Models with Domain Knowledge through Model Merging](https://aclanthology.org/2024.emnlp-main.868/) (Lin et al., EMNLP 2024)