% DART paper (Pan, Liang, Kabbara, Emami), Findings of ACL 2026, pages 4940--4980.
% The url field uses the stable Anthology address derived from the paper ID 2026.findings-acl.244.
% NOTE(review): the address field appears to hold the conference venue rather than the
% publisher's city -- confirm against the target bibliography style before changing it.
@inproceedings{pan-etal-2026-dart,
  title     = {{DART}: Mitigating Harm Drift in Difference-Aware {LLMs} via Distill-Audit-Repair Training},
  author    = {Pan, Ziwen and
               Liang, Zihan and
               Kabbara, Jad and
               Emami, Ali},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.244/},
  pages     = {4940--4980},
  isbn      = {979-8-89176-395-1},
  abstract  = {Large language models (LLMs) tuned for safety often avoid acknowledging demographic differences, even when such acknowledgment is factually correct (e.g., ancestry-based disease incidence) or contextually justified (e.g., religious hiring preferences). This \emph{identity-blindness} yields incorrect responses, unnecessary refusals, or generic ``equal-treatment'' defaults. We study this via difference-awareness classification: given a question involving demographic groups, the task is not to answer directly, but to classify whether a correct answer requires recognizing group differences (\textbf{YES}) or whether groups should be treated identically (\textbf{NO}). Crucially, fine-tuning for accuracy triggers \emph{harm drift}: model-generated explanations become increasingly harmful as decision accuracy improves, whether by elaborating harmful content, introducing problematic assumptions, or failing to flag harms the baseline identified. To mitigate this, we introduce \textbf{DART} (\textbf{D}istill{--}\textbf{A}udit{--}\textbf{R}epair \textbf{T}raining), which distills label-conditioned reasoning from a teacher, audits outputs for harm drift cases relative to baseline, and repairs problematic cases via severity-weighted fine-tuning. On eight benchmarks, DART improves Llama-3-8B-Instruct accuracy from 39.0{\%} to 68.8{\%}, with largest gains on equal-treatment prompts (11.3{\%} {\textrightarrow} 72.6{\%}), while reducing harm drift cases by 72.6{\%}. It also transfers to 280 open-ended real-world queries across medical, legal, policy, and educational domains, improving difference-appropriate responses from 39.8{\%} to 77.5{\%} while reducing refusals from 34.3{\%} to 3.0{\%}. Our results demonstrate that accuracy and safety need not conflict when explicit detection and repair mechanisms are in place.},
}