@inproceedings{shrestha-srinivasan-2025-llm,
  title     = {{LLM} Bias Detection and Mitigation through the Lens of Desired Distributions},
  author    = {Shrestha, Ingroj and
               Srinivasan, Padmini},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-luhme/2025.emnlp-main.76/},
  doi       = {10.18653/v1/2025.emnlp-main.76},
  pages     = {1464--1480},
  isbn      = {979-8-89176-332-6},
  abstract  = {Although prior work on bias mitigation has focused on promoting social equality and demographic parity, less attention has been given to aligning LLM{'}s outputs to desired distributions. For example, we might want to align a model with real-world distributions to support factual grounding. Thus, we define bias as deviation from a desired distribution, which may be an equal or real-world distribution, depending on application goals. We propose a weighted adaptive loss based fine-tuning method that aligns LLM{'}s gender{--}profession output distribution with the desired distribution, while preserving language modeling capability. Using 3 profession sets{---}male-dominated, female-dominated, and gender-balanced{---}derived from U.S. labor statistics (2024), we assess both our adaptive method for reflecting reality and a non-adaptive variant for equality. Across three masked language models, bias is observed under both distributions. We achieve near-complete mitigation under equality and 30{--}75{\%} reduction under real-world settings. Autoregressive LLMs show no bias under equality but notable bias under real-world settings, with the Llama Instruct models (3.2-3B, 3.1-8B) achieving a 50{--}62{\%} reduction.},
}
Markdown (Informal)
[LLM Bias Detection and Mitigation through the Lens of Desired Distributions](https://preview.aclanthology.org/ingest-luhme/2025.emnlp-main.76/) (Shrestha & Srinivasan, EMNLP 2025)
ACL