% Conference paper (ACL 2026, Volume 1: Long Papers) proposing BinSKD, a
% selective knowledge-distillation method for binary code similarity detection.
% NOTE(review): the url field points at the preview.aclanthology.org ingest
% branch -- confirm whether a canonical aclanthology.org URL should replace it.
@inproceedings{zhou-etal-2026-selective,
    title = "Selective Knowledge Distillation: Fusing {LLM} Semantic Strengths with {DNN} Efficiency for Binary Code Similarity Detection",
    author = "Zhou, Shize and
      Liu, Peiyu and
      Fu, Lirong and
      Ye, Tong and
      Wang, Wenhai",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl/2026.acl-long.1193/",
    pages = "26002--26014",
    isbn = "979-8-89176-390-6",
    abstract = "Binary Code Similarity Detection (BCSD) plays a vital role in various security applications, including vulnerability identification, malware analysis, and code plagiarism detection. With the growing adoption of deep neural networks (DNNs), substantial progress has been made in recognizing and classifying similar code segments. However, DNN-based BCSD methods often exhibit low accuracy and robustness because they struggle to capture fine-grained and high-level program semantics. In contrast, such semantics are typically captured through natural language interpretations of source code by large language models (LLMs). Yet, LLM-based BCSD methods are constrained by their large model sizes and high inference latency. To alleviate these limitations, this paper proposes BinSKD. The key idea is to leverage an LLM-based BCSD method as the teacher model and transfer its knowledge of high-level program semantics to various DNN-based student models. Specifically, to avoid propagating errors from the teacher to the student, we introduce selective distillation, selecting targets with accurate semantics according to their detection retrieval. In addition, to mitigate the noise introduced by a number of negative samples during distillation, we further propose discrepancy-weighted sampling to focus on the samples where the student{'}s prediction notably deviates from the teacher{'}s. Our experiments show that BinSKD yields Recall@1 improvements of 14.5{\%}{--}91.2{\%} for DNN-based BCSD methods and enables HermesSim to match the teacher{'}s performance with orders-of-magnitude efficiency."
}
Markdown (Informal)
[Selective Knowledge Distillation: Fusing LLM Semantic Strengths with DNN Efficiency for Binary Code Similarity Detection](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1193/) (Zhou et al., ACL 2026)
ACL