@inproceedings{li-etal-2026-gre,
title = "{GRE} Score: Generative Risk Evaluation for Large Language Models",
author = "LI, Zaitang and
Chen, Pin-Yu and
Ho, Tsung-Yi",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1202/",
pages = "24007--24033",
ISBN = "979-8-89176-395-1",
abstract = "Large Language Models (LLMs) have revolutionized generative tasks, but concerns about their trustworthiness and vulnerability to adversarial attacks persist. This paper introduces the Generative Robustness Evaluation (GRE) Score, a novel metric designed to assess LLMs' resilience against adversarial red teaming attempts that may compromise model compliance and elicit undesired responses. Our approach utilizes conditional generation for synthetic text creation, offering an attack-independent evaluation of LLM robustness. By calculating the margin in refusal scores, we quantify the robustness of LLMs in an attack-agnostic manner. We evaluate our method on five dimensions with specified datasets, encompassing ethical considerations, safety protocols, and potential misuse scenarios. We present four contributions: (1) The GRE Score framework, which establishes a textual robustness certificate for LLMs against adversarial red teaming attempts, providing a theoretical foundation for quantifying model resilience. (2) Comprehensive evaluations across five dimensions using eight prominent LLMs, validating GRE Scores with adversarial red teaming attacks. Our method demonstrates a consistent ranking of LLM robustness when compared to the attack-based model ranking on TrustLLM (CITATION) while achieving a significant 5-8x speedup compared to traditional evaluation techniques. (3) Insights into the non-linear relationship between model scaling and performance, revealing that larger models do not always perform better, and an analysis of how instruction-tuning impacts robustness across LLMs. (4) The discovery that all evaluated LLMs exhibit lower performance in robustness and privacy tasks compared to other areas, highlighting a critical gap in capabilities."
}Markdown (Informal)
[GRE Score: Generative Risk Evaluation for Large Language Models](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1202/) (LI et al., Findings 2026)
ACL