% BioNLP 2026 paper, ACL Anthology export. NOTE(review): the url points at the
% preview.aclanthology.org host -- re-check the link once the volume is published.
@inproceedings{yoo-2026-demographic,
  title     = {When Demographic Sensitivity Isn't What It Seems: Baseline-Aware Counterfactual Audits for Clinical {NLP}},
  author    = {Yoo, Hyunwoo},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{BioNLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.14/},
  pages     = {141--155},
  isbn      = {979-8-89176-434-7},
  abstract  = {Clinical NLP systems are increasingly used for triage support, prediction, and decision assistance in EHR-based settings, where demographic fairness is a critical concern. A common evaluation approach is counterfactual demographic perturbation: modifying attributes such as age or sex while holding clinical evidence fixed and measuring output changes. However, we show that such counterfactual audits can be misleading when interpreted in isolation. Across three clinical LLMs, we find that non-demographic control perturbations (e.g., paraphrases) often induce output variability comparable to or greater than demographic edits. This can contribute to overestimation or misinterpretation of demographic bias. To address this, we propose a baseline-aware audit framework that explicitly compares demographic perturbations against control baselines. Our analysis reveals that (i) label-level stability can mask substantial variation in generated rationales and recommendations, and (ii) age-based perturbations generally induce larger effects than sex-based ones in borderline cases. Crucially, we identify a high intrinsic instability (``noise floor''; 0.46--0.71 Jaccard instability) in clinical LLM generations, while additional matched-metric analyses show that demographic perturbations are often comparable to non-demographic baseline variability. These findings highlight a key limitation of existing fairness evaluations: without establishing appropriate baselines, apparent demographic sensitivity may be over- or mis-attributed to bias rather than broader generative instability. We argue that baseline-aware counterfactual audits, which explicitly compare demographic effects against intrinsic model noise, provide a more reliable lens for evaluating clinical NLP systems in high-stakes settings.},
}
Markdown (Informal)
[When Demographic Sensitivity Isn’t What It Seems: Baseline-Aware Counterfactual Audits for Clinical NLP](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.14/) (Yoo, BioNLP 2026)
ACL