@inproceedings{sun-etal-2022-bertscore,
title = "{BERTS}core is Unfair: On Social Bias in Language Model-Based Metrics for Text Generation",
author = "Sun, Tianxiang and
He, Junliang and
Qiu, Xipeng and
Huang, Xuanjing",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-main.245/",
doi = "10.18653/v1/2022.emnlp-main.245",
pages = "3726--3739",
abstract = "Automatic evaluation metrics are crucial to the development of generative systems. In recent years, pre-trained language model (PLM) based metrics, such as BERTScore, have been commonly adopted in various generation tasks. However, it has been demonstrated that PLMs encode a range of stereotypical societal biases, leading to a concern about the fairness of PLMs as metrics. To that end, this work presents the first systematic study on the social bias in PLM-based metrics. We demonstrate that popular PLM-based metrics exhibit significantly higher social bias than traditional metrics on 6 sensitive attributes, namely race, gender, religion, physical appearance, age, and socioeconomic status. In-depth analysis suggests that choosing paradigms (matching, regression, or generation) of the metric has a greater impact on fairness than choosing PLMs. In addition, we develop debiasing adapters that are injected into PLM layers, mitigating bias in PLM-based metrics while retaining high performance for evaluating text generation."
}
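
As context for the entry above: BERTScore is a PLM-based metric that scores a candidate against a reference via contextual-embedding similarity. Below is a minimal sketch (not the authors' code) of the kind of sensitive-attribute probe the abstract describes, using the public `bert-score` package; the probe pair and the fairness reading are illustrative assumptions, and the paper's actual evaluation protocol and datasets differ.

```python
# Minimal sketch: probing a PLM-based metric for sensitive-attribute
# asymmetry with the `bert-score` package (pip install bert-score).
from bert_score import score

reference = ["The nurse prepared the medication before the shift."]

# Two candidates identical except for a gendered pronoun (hypothetical probe pair).
cand_a = ["She prepared the medication before the shift."]
cand_b = ["He prepared the medication before the shift."]

# lang="en" selects the package's default English model; score() returns
# precision, recall, and F1 tensors, one entry per candidate/reference pair.
_, _, f1_a = score(cand_a, reference, lang="en")
_, _, f1_b = score(cand_b, reference, lang="en")

# A fair metric should score the pair (nearly) identically; a persistent
# gap across many such pairs is the kind of bias signal the paper studies.
print(f"F1(she)={f1_a.item():.4f}  F1(he)={f1_b.item():.4f}  "
      f"gap={abs(f1_a - f1_b).item():.4f}")
```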