% EMNLP 2025 main-conference paper (ACL Anthology ID: 2025.emnlp-main.794).
% Cleaned from a raw ACL Anthology export: stripped literal Markdown bold
% markers (**...**) that had leaked into the abstract, and separated the
% webpage "Markdown (Informal)" citation residue from the entry body.
% NOTE(review): url was pointing at the ephemeral ingestion preview host
% (preview.aclanthology.org/ingest-emnlp); replaced with the canonical
% anthology URL for the same paper ID -- confirm it resolves.
@inproceedings{nguyen-etal-2025-read,
  title     = {What You Read Isn{'}t What You Hear: Linguistic Sensitivity in Deepfake Speech Detection},
  author    = {Nguyen, Binh and
               Shi, Shuju and
               Ofman, Ryan and
               Le, Thai},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-main.794/},
  pages     = {15752--15766},
  isbn      = {979-8-89176-332-6},
  abstract  = {Recent advances in text-to-speech technology have enabled highly realistic voice generation, fueling audio-based deepfake attacks such as fraud and impersonation. While audio anti-spoofing systems are critical for detecting such threats, prior research has predominantly focused on acoustic-level perturbations, leaving the impact of linguistic variation largely unexplored. In this paper, we investigate the linguistic sensitivity of both open-source and commercial anti-spoofing detectors by introducing TAPAS (Transcript-to-Audio Perturbation Anti-Spoofing), a novel framework for transcript-level adversarial attacks. Our extensive evaluation shows that even minor linguistic perturbations can significantly degrade detection accuracy: attack success rates exceed 60{\%} on several open-source detector{--}voice pairs, and the accuracy of one commercial detector drops from 100{\%} on synthetic audio to just 32{\%}. Through a comprehensive feature attribution analysis, we find that linguistic complexity and model-level audio embedding similarity are key factors contributing to detector vulnerabilities. To illustrate the real-world risks, we replicate a recent Brad Pitt audio deepfake scam and demonstrate that TAPAS can bypass commercial detectors. These findings underscore the need to move beyond purely acoustic defenses and incorporate linguistic variation into the design of robust anti-spoofing systems. Our source code is available at https://github.com/nqbinh17/audio{\_}linguistic{\_}adversarial.},
}

% Informal Markdown citation copied from the anthology page; kept for
% reference but wrapped in @comment so no BibTeX tool trips over it.
@comment{
  Markdown (Informal)
  [What You Read Isn’t What You Hear: Linguistic Sensitivity in Deepfake Speech Detection](https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.794/) (Nguyen et al., EMNLP 2025)
  ACL
}