@inproceedings{alhetelah-ahmad-2026-measuring,
title = "Measuring {LLM}s' Sensitivity to Paraphrased Opinion Prompts",
author = "Alhetelah, Bushra and
Ahmad, Irfan",
editor = "Barnes, Jeremy and
Barriere, Valentin and
De Clercq, Orph{\'e}e and
Klinger, Roman and
Nouri, C{\'e}lia and
Nozza, Debora and
Singh, Pranaydeep",
booktitle = "The Proceedings for the 15th Workshop on Computational Approaches to Subjectivity, Sentiment Social Media Analysis ({WASSA} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-eacl/2026.wassa-1.5/",
pages = "52--59",
ISBN = "979-8-89176-378-4",
abstract = "Large language models (LLMs) are now widely used in applications that depend on closed-ended decisions, including automated surveys, policy screening, and decision-support tools. In such contexts, these models are typically expected to produce consistent binary or ternary responses (for example, Yes, No, or Neither) when presented with questions that are semantically equivalent. However recent studies shows that LLM outputs can be influenced by relatively minor changes in prompt wording, raising concerns about the reliability of their decisions under paraphrasing. In this paper, we conduct a systematic analysis of paraphrase robustness across five widely used LLMs. To support this evaluation, we develop a controlled dataset consisting of 200 opinion-based questions drawn from multiple domains, each accompanied by five human-validated paraphrases. All models are evaluated under deterministic inference settings and constrained to a fixed Yes/No/Neither response format. We assess model behavior using a set of complementary metrics that capture the stability of each evaluated model. DeepSeek Reasoner and Gemini 2.0 Flash show the highest stability when responding to paraphrased inputs, whereas Claude 3.7 Sonnet exhibits strong internal consistency but produces judgments that differ more frequently from those of other models. By contrast, GPT-3.5 Turbo and LLaMA 3 70B display greater sensitivity to surface-level variations in prompt phrasing. Overall, these findings suggest that robustness to paraphrasing is driven more by alignment strategies and reasoning design choices than by model size alone."
}