% Conference paper in the Findings of ACL 2026 proceedings (Sharma and Wang).
@inproceedings{sharma-wang-2026-unified,
  author    = {Sharma, Aastha and Wang, Guangjing},
  title     = {A Unified Feature Mixture Framework for Joint Speech and Singing Deepfake Detection},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {24853--24863},
  isbn      = {979-8-89176-395-1},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1245/},
  abstract  = {High-fidelity audio generation techniques, such as voice conversion and singing voice synthesis, have significantly increased the risk of audio deepfakes. Although existing methods perform well on conversational speech deepfake detection, they fail severely under the speech-to-singing domain shift. To address this limitation, we propose GenuVoice, a unified deepfake detector based on a multi-branch mixture-of-experts architecture that integrates three complementary feature views: Wav2Vec 2.0 representations, log-mel spectrograms, and mel-frequency cepstral coefficients (MFCC). Each expert is trained to remain independently discriminative, while a learned gating network dynamically weights expert contributions. A speech-retentive multi-domain fine-tuning strategy enables adaptation to singing without degrading speech performance. GenuVoice achieves 1.82{\%} Equal Error Rate (EER) on CtrSVDD, compared to 37{--}62{\%} for existing speech-trained detectors, while preserving strong speech performance (0.38{\%} EER on ASVspoof 2019) and generalizing to unseen generators (8.89{\%} EER on held-out ASVspoof 2021). Extensive ablations confirm the importance of multi-expert fusion and speech retention, establishing GenuVoice as an effective unified detector for speech and singing deepfakes. The implementation code is available at \url{https://github.com/aastha-sharma/genuvoice}},
}
Markdown (Informal)
[A Unified Feature Mixture Framework for Joint Speech and Singing Deepfake Detection](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1245/) (Sharma & Wang, Findings 2026)
ACL