@inproceedings{tiwari-2025-extended,
title = "Extended Abstract: Probing-Guided Parameter-Efficient Fine-Tuning for Balancing Linguistic Adaptation and Safety in {LLM}-based Social Influence Systems",
author = "Tiwari, Manyana",
editor = "Hale, James and
Deuksin Kwon, Brian and
Dutt, Ritam",
booktitle = "Proceedings of the Third Workshop on Social Influence in Conversations (SICon 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.sicon-1.12/",
pages = "145--147",
ISBN = "979-8-89176-266-4",
    abstract = "Designing effective LLMs for social influence (SI) tasks demands controlling linguistic output so that it adapts to context (such as user attributes, history, etc.) while upholding ethical guardrails. Standard Parameter-Efficient Fine-Tuning (PEFT) methods like LoRA struggle to manage the trade-off between adaptive linguistic expression and safety, as they optimize for an overall objective without differentiating the functional roles of internal model components. We therefore introduce Probing-Guided PEFT (PG-PEFT), a novel fine-tuning strategy that utilizes interpretability probes to identify LLM components associated with context-driven linguistic variation versus those linked to safety violations (e.g., toxicity, bias). This functional map then guides LoRA updates, enabling more targeted control over the model{'}s linguistic output. We evaluate PG-PEFT against standard PEFT on SI tasks (persuasion, negotiation) and on linguistic adaptability and safety benchmarks."
}
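The abstract describes the core mechanism at a high level: train interpretability probes to map which internal components carry context-adaptation signal versus safety-violation signal, then restrict LoRA updates accordingly. As a rough, hypothetical illustration of that idea (not the paper's implementation), the toy PyTorch sketch below selects LoRA target modules from made-up probe scores; the module names, scores, thresholds, and model are all invented for the example, and in the actual method the probe readouts would be learned from model activations rather than hard-coded.

```python
# Toy sketch of probing-guided LoRA targeting. Hypothetical only: probe scores,
# thresholds, and the model below are invented, not taken from the paper.
import torch
import torch.nn as nn


class LoRALinear(nn.Module):
    """Frozen linear layer plus a trainable low-rank update: W x + (alpha/r) * B A x."""

    def __init__(self, base: nn.Linear, r: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = base
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)


class TinyBlock(nn.Module):
    """Stand-in for a transformer block with an 'attn' and an 'mlp' projection."""

    def __init__(self, d: int = 64):
        super().__init__()
        self.attn = nn.Linear(d, d)
        self.mlp = nn.Linear(d, d)

    def forward(self, x):
        return self.mlp(torch.relu(self.attn(x)))


def select_target_modules(probe_scores, adapt_min=0.7, safety_max=0.3):
    """Keep modules whose probes indicate strong context-adaptation signal but
    weak association with safety violations (thresholds are illustrative)."""
    return [name for name, s in probe_scores.items()
            if s["adaptation"] >= adapt_min and s["safety"] <= safety_max]


model = nn.Sequential(TinyBlock(), TinyBlock(), TinyBlock())
for p in model.parameters():
    p.requires_grad = False  # freeze the backbone; only LoRA params will train

# Hypothetical probe readouts per submodule (in practice these would come from
# linear probes trained on each module's activations).
probe_scores = {
    "0.mlp":  {"adaptation": 0.82, "safety": 0.12},
    "1.attn": {"adaptation": 0.75, "safety": 0.41},  # skipped: safety-linked
    "2.mlp":  {"adaptation": 0.55, "safety": 0.10},  # skipped: weak adaptation signal
}

# Wrap only the probe-selected linears with LoRA adapters.
for name in select_target_modules(probe_scores):
    parent_name, _, child = name.rpartition(".")
    parent = model.get_submodule(parent_name) if parent_name else model
    setattr(parent, child, LoRALinear(getattr(parent, child)))

trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(trainable)  # e.g. ['0.mlp.A', '0.mlp.B'] -- only the selected module adapts
```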
Markdown (Informal)
[Extended Abstract: Probing-Guided Parameter-Efficient Fine-Tuning for Balancing Linguistic Adaptation and Safety in LLM-based Social Influence Systems](https://aclanthology.org/2025.sicon-1.12/) (Tiwari, SICon 2025)