% This must be in the first 5 lines to tell arXiv to use pdfLaTeX, which is strongly recommended.
% \pdfoutput=1
% In particular, the hyperref package requires pdfLaTeX in order to break URLs across lines.

\documentclass[11pt]{article}

% Remove the "review" option to generate the final version.
\usepackage{ACL2023}

% Standard package includes
\usepackage{times}
\usepackage{latexsym}
\usepackage{booktabs}
\usepackage{amssymb}
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage{longtable}
\usepackage{array}
\usepackage{amsmath}
\usepackage{tabularx}
\usepackage{multirow}
\usepackage{hyperref}

% For proper rendering and hyphenation of words containing Latin characters (including in bib files)
\usepackage[T1]{fontenc}
% For Vietnamese characters
% \usepackage[T5]{fontenc}
% See https://www.latex-project.org/help/documentation/encguide.pdf for other character sets

% This assumes your files are encoded as UTF8
\usepackage[utf8]{inputenc}

% This is not strictly necessary, and may be commented out.
% However, it will improve the layout of the manuscript,
% and will typically save some space.
\usepackage{microtype}

% This is also not strictly necessary, and may be commented out.
% However, it will improve the aesthetics of text in
% the typewriter font.
\usepackage{inconsolata}


% If the title and author information does not fit in the area allocated, uncomment the following
%
%\setlength\titlebox{<dim>}
%
% and set <dim> to something 5cm or larger.

\title{YNU-HPCC at SemEval-2026 Task 13: Robust Machine-Generated Code Detection under Distribution Shifts}

% Author information can be set in various styles:
% For several authors from the same institution:
% \author{Author 1 \and ... \and Author n \\
%         Address line \\ ... \\ Address line}
% if the names do not fit well on one line use
%         Author 1 \\ {\bf Author 2} \\ ... \\ {\bf Author n} \\
% For authors from different institutions:
% \author{Author 1 \\ Address line \\  ... \\ Address line
%         \And  ... \And
%         Author n \\ Address line \\ ... \\ Address line}
% To start a separate ``row'' of authors use \AND, as in
% \author{Author 1 \\ Address line \\  ... \\ Address line
%         \AND
%         Author 2 \\ Address line \\ ... \\ Address line \And
%         Author 3 \\ Address line \\ ... \\ Address line}

\author{Lixian Xing, Jin Wang and Xuejie Zhang \\
        School of Information Science and Engineering \\
Yunnan University \\
Kunming, China \\
        xinglixian@stu.ynu.edu.cn, \{wangjin, xjzhang\}@ynu.edu.cn}

\begin{document}
\maketitle
\begin{abstract}
As Large Language Models (LLMs) become prevalent in software development, distinguishing machine-generated from human-written code is increasingly important. This paper describes the system developed by the YNU-HPCC team for SemEval-2026 Task 13, which evaluates detection under cross-language, multi-generator, and hybrid settings. Three modeling paradigms are systematically examined: encoder-based fine-tuning, feature-based machine learning, and task-specific robustness strategies. For Subtask A (Binary Detection), frozen pre-trained encoders and shallow stylometric features exhibit stronger cross-domain robustness than full fine-tuning, with indentation entropy identified as a key discriminative signal. For Subtask B (Multi-Class Attribution), a hierarchical two-stage framework is adopted to decouple human–machine discrimination from fine-grained generator attribution, alleviating severe class imbalance. For Subtask C (Hybrid Detection), a token-level splicing augmentation strategy combined with Supervised Contrastive Learning and Focal Loss is employed to model intra-sample stylistic variation. According to the official leaderboard, our system ranked 12th out of 81 teams in Subtask A, 14th out of 34 in Subtask B, and 8th out of 32 in Subtask C.
\end{abstract}


\section{Introduction}
LLMs are increasingly used for code generation in competitive programming, education, and industrial development \cite{chen_evaluating_2021}. Although these systems improve productivity, they raise concerns regarding authorship attribution, academic integrity, and software reliability. Automatically generated code may introduce subtle logical errors or stylistic artifacts that are difficult to detect manually. Consequently, reliable detection of machine-generated code has become an important research problem. 

SemEval-2026 Task 13~\cite{orel-etal-2026-semeval-2026} evaluates detection systems under diverse and heterogeneous conditions \cite{orel_droid_2025, orel_codet-m4_2025}. The task includes three subtasks: (i) binary human vs.\ machine detection across seen and unseen languages and domains; (ii) multi-class attribution across multiple LLM families under severe class imbalance; and (iii) hybrid and adversarial detection, where human and machine contributions coexist within a single snippet. These settings emphasize robustness to distributional variation rather than purely in-domain accuracy.

Recent studies suggest that performance degradation under unseen languages, generator families, and superficial edits remains substantial even for strong detectors \cite{orel_codet-m4_2025}. Furthermore, code differs from natural language in that many tokens are syntactically constrained and exhibit low entropy, which weakens naïve likelihood-based detection signals. Therefore, the central challenge of machine-generated code detection can be framed as one of distributional robustness—the ability to generalize across languages, domains, generator families, and intra-sample stylistic shifts.

In this paper, three modeling paradigms are systematically investigated in the shared-task setting: encoder-based fine-tuning, feature-based machine learning, and task-specific strategies for handling imbalance and hybrid modeling. For Subtask A, shallow stylometric features outperform deeper semantic representations in cross-domain evaluation. For Subtask B, a hierarchical two-stage framework is introduced to decouple majority-class filtering from fine-grained generator attribution. For Subtask C, a local style inconsistency modeling strategy based on token-level splicing is proposed to enhance robustness to intra-sample distribution shifts.

The remainder of this paper is organized as follows. 
Section~\ref{sec:sec2} reviews related work.
Section~\ref{sec:sec3} presents the proposed methodologies.
Section~\ref{sec:sec4} describes the experimental setup and results.
Section~\ref{sec:sec5} concludes the paper.
Supplementary details are included in the Appendix.



\section{Related Work}
\label{sec:sec2}
Recent work on AI-generated code detection spans supervised transformer-based classifiers, code-aware zero-shot detectors, and increasingly realistic multilingual benchmarks. Early supervised methods such as GPTSniffer showed that pretrained code encoders can distinguish human-written from model-generated code in narrow settings \cite{nguyen_gptsniffer_2024}. Subsequent zero-shot approaches adapted generic machine-generated text detection to code by introducing code-specific probability modeling, perturbation strategies, or rewriting-based similarity signals \cite{xianjun2023zero,shi2025between,ye2025uncovering,ashkenazi2025zero}. More recent benchmark papers broaden the problem to unseen languages, unseen generators, human–AI coauthorship, paraphrase, and adversarial humanization, consistently showing that robust generalization remains challenging \cite{orel_codet-m4_2025,orel_droid_2025,guo2025codemirage,orel2026aicd}.

The multiclass attribution setting of Subtask B is closely related to code stylometry and source-code authorship analysis. Prior work on source-code authorship verification shows that contrastive transformer encoders can learn strong stylometric representations, while recent LLM-oriented studies demonstrate that attribution among generator families is feasible in controlled settings using encoder-style architectures over code tokens and style signals \cite{alvarez2025clave,dipongkor2025reassessing,bisztray2025know}. These findings motivate framing Subtask B not merely as multiclass classification, but as a stylometric attribution problem under imbalance and family-level similarity. 

Pretrained code models offer complementary inductive biases for Task 13. Token-level models such as CodeBERT provide strong baselines, while GraphCodeBERT and UniXcoder incorporate data-flow or AST/comment information that may better support cross-language robustness and mixed-authorship detection \cite{feng_codebert_2020,guo_graphcodebert_2021,guo_unixcoder_2022}. Encoder-decoder and instruction-tuned models such as CodeT5+ and Code Llama further suggest that transfer from broad code pretraining can be useful, although recent detection literature indicates that direct transfer from generic text detectors is often insufficient without code-specific adaptation \cite{wang2023codet5+,roziere_code_2023,xianjun2023zero,shi2025between}.

\section{Methodology}
\label{sec:sec3}
Robust machine-generated code detection under heterogeneous conditions requires generalization across languages, generator families, and intra-sample stylistic shifts. To systematically examine modeling strategies under these constraints, three paradigms are investigated: (1) encoder-based fine-tuning, (2) feature-based classification, and (3) task-specific robustness strategies.

\subsection{Encoder-based Fine-tuning}
Pre-trained code encoders, including UniXcoder\footnote{We use the \texttt{microsoft/unixcoder-base-nine} variant, which extends language coverage to nine programming languages: \url{https://huggingface.co/microsoft/unixcoder-base-nine}.} \cite{guo_unixcoder_2022} and GraphCodeBERT \cite{guo_graphcodebert_2021}, are employed as backbone models. These models are specifically designed for source code representation learning. UniXcoder adopts a unified cross-modal pre-training objective over code, comments, and abstract syntax trees, and can function as an encoder, decoder, or both. GraphCodeBERT incorporates data-flow structure to enhance semantic and structural understanding of code. Given an input snippet $x$, contextualized representations are obtained and fed into a linear classifier.
Two training settings are explored: (i) full fine-tuning of all parameters, and (ii) frozen encoder with only the classification head updated. This paradigm serves as a deep representation baseline under cross-domain evaluation.
\subsection{Feature-based Classification}

To reduce reliance on distribution-specific semantic representations, 
a feature-based paradigm is adopted. Extracted features are fed into 
a LightGBM classifier \cite{ke_lightgbm_2017}.

\paragraph{Shallow Stylometric Features.}
Surface-level statistical and formatting patterns are computed, 
including token counts, lexical ratios, indentation entropy, 
and structural depth indicators. These features aim to capture 
coding habits and layout consistency.

\paragraph{AST Structural Features.}
Source code is parsed with Tree-Sitter\footnote{Tree-Sitter is an open-source 
incremental parsing system for programming tools, capable of generating 
concrete and abstract syntax trees (ASTs) for multiple programming languages. 
See \url{https://github.com/tree-sitter/tree-sitter}.} 
to obtain structural metrics, including node count, tree depth, 
branching factor, and node-type entropy. These features encode 
syntactic complexity without relying on generator-specific semantics.

\paragraph{Logits-Based Predictability Features.}
Frozen code-oriented LLMs are used as probabilistic scorers. 
For a token sequence $x = (t_1, \dots, t_n)$, token-level probability 
distributions are computed, and summary statistics such as mean 
negative log-likelihood, entropy, margin, and miss rate are derived. 
Specifically, CodeLlama-7B \cite{roziere_code_2023}, 
Llama-3.1-8B \cite{team_llama_2024}, 
CodeQwen-1.5-7B \cite{bai_qwen_2023}, and 
Nxcode-orpo-7B \cite{hong_orpo_2024} are employed following 
the model selection criteria of CoDet-M4 \cite{orel_codet-m4_2025}. 
Logit-based statistics are extracted independently for each model 
and concatenated to form the final predictability feature vector, 
thereby reducing scorer-specific bias.

Detailed definitions and calculation formulas are provided in \autoref{app:appa}.
\subsection{Hierarchical Two-Stage Framework (Subtask B)}
To address severe class imbalance in Subtask B, a two-stage framework is adopted. Stage 1 performs binary human–machine classification. Stage 2 applies multi-class generator attribution to samples predicted as machine-generated. This decomposition reduces the dominance of the majority class in fine-grained classification.

\subsection{Local Style Inconsistency Modeling (Subtask C)}
Hybrid detection requires modeling intra-sample distribution shifts. To enhance robustness, a data augmentation strategy is introduced to generate synthetic hybrid samples before training.
Specifically, a human-written snippet $H$ and a machine-generated snippet $M$ are sampled from the same programming language subset to avoid cross-language artifacts. The two sequences are tokenized and truncated, and a synthetic hybrid sample is constructed as:
\begin{equation}
\tilde{X}=\texttt{concat}(H_{1:i}, M_{j:k})
\end{equation}
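where \texttt{concat} denotes sequence concatenation, $H_{1:i}$ denotes the first $i$ tokens of $H$, and $M_{j:k}$ denotes the tokens of $M$ from position $j$ to position $k$.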

In addition, Supervised Contrastive Learning \cite{khosla_supervised_2020} is applied to improve representation separability, and Focal Loss \cite{lin_focal_2017} is employed to mitigate class imbalance.

\section{Experiments \& Results}
\label{sec:sec4}
In this section, the proposed modeling paradigms are empirically evaluated on all three subtasks. Experimental settings are first described, followed by detailed performance analysis for binary detection, multi-class attribution, and hybrid detection.
\subsection{Experimental Setup}
\paragraph{Dataset.} 
All datasets are provided by the task organizers, and no external data is used. To mitigate data scarcity in the Hybrid category of Subtask C, a token-level splicing strategy is used to generate synthetic hybrid samples before training. Human-written and machine-generated snippets are sampled within the same programming language subset. For each pair, token sequences are truncated and concatenated according to a randomly sampled split ratio $\alpha \in[0.25,0.75]$, producing a fixed-length sequence of 512 tokens. The concatenation order is randomized to simulate diverse transition patterns. The resulting samples are labeled as Hybrid and added to the training set.

\paragraph{Evaluation Metrics.}
Macro F1-score is adopted as the primary evaluation metric for all subtasks, following the official setting. The metric computes the unweighted mean of per-class F1 scores, treating all classes equally regardless of their frequency. For class $i$, the F1-score $F_i$ is defined as the harmonic mean of precision and recall. With $C$ denoting the number of classes, the overall score is computed as:
\begin{equation}
\label{f1}
\text{Macro-F1} = \frac{1}{C} \sum_{i=1}^{C} F_i
\end{equation}

\paragraph{Hyperparameter Setting.}
For all deep learning models, the initial learning rate is set to $1.41 \times 10^{-4}$ with a weight decay of $1 \times 10^{-2}$. An effective batch size of $96 \times 3$ is used via gradient accumulation. Training is conducted for up to 3 epochs with a linear learning rate scheduler.

The maximum input length is fixed to 512 tokens for fine-tuning. For zero-shot inference and feature extraction of logits from large language models, the maximum length is extended to 1024 tokens.

For the LightGBM classifier, the learning rate is set to $5 \times 10^{-3}$, the maximum number of leaves is 32, and the minimum number of data points per leaf is 20. All other hyperparameters are kept at their default values.

\subsection{Subtask A: Binary Machine-Generated Code Detection}
Two encoder-based paradigms were evaluated using UniXcoder-base and GraphCodeBERT-base: full fine-tuning and frozen encoder with a linear classification head. The results are shown in \autoref{tab:tab1}.

Frozen encoders consistently outperform full fine-tuning. UniXcoder improves from 0.43 to 0.47 F1 when the backbone is frozen, and GraphCodeBERT achieves the best encoder-based F1 of 0.51 under this setting. In contrast, full fine-tuning leads to performance degradation.

This trend suggests that updating all encoder parameters may introduce overfitting to training-domain artifacts, reducing generalization to unseen languages and domains. Freezing the backbone preserves pre-trained representations while limiting distribution-specific adaptation, resulting in improved cross-domain robustness.
\begin{table}[h]
\centering
\begin{tabular}{lc}
\toprule
Method & F1 \\
\midrule
UniXcoder & 0.43 \\
UniXcoder (frozen) & 0.47 \\
GraphCodeBERT (frozen) & \textbf{0.51} \\
\bottomrule
\end{tabular}
\caption{Encoder-based method classification results.}
\label{tab:tab1}
\end{table}

\begin{table}[h]
\centering
\begin{tabular}{cccc}
\toprule
Shallow & AST & Logits & F1 \\
\midrule
 &  & $\checkmark$ & 0.48 \\
 & $\checkmark$ &  & 0.58 \\
 & $\checkmark$ & $\checkmark$ & 0.54 \\
$\checkmark$ &  &  & \textbf{0.69} \\
$\checkmark$ &  & $\checkmark$ & 0.43 \\
$\checkmark$ & $\checkmark$ &  & 0.59 \\
$\checkmark$ & $\checkmark$ & $\checkmark$ & 0.55 \\
\bottomrule
\end{tabular}
\caption{Ablation study results. 
Shallow features contain 21 dimensions, 
AST features contain 10 dimensions, 
and Logits-based features contain 10 dimensions.}
\label{tab:tab2}
\end{table}

\paragraph{Feature-based Machine Learning.}
\autoref{tab:tab2} reports the LightGBM results under different feature combinations. The Shallow feature set alone achieves the best performance (0.69 F1), outperforming both AST (0.58) and Logits (0.48). Notably, feature fusion does not improve performance: combining Shallow with Logits leads to a substantial drop (0.43), and adding AST features reduces F1 to 0.59. These results suggest that deeper structural or model-derived representations do not enhance cross-domain robustness in this setting.

A possible explanation is that the logits features are extracted from only four frozen scorer LLMs, while the test set contains a broader set of generators. Therefore, the resulting statistics reflect relative predictability with respect to selected scorers rather than generator-invariant signals, which may limit generalization. Consequently, subsequent analysis focuses on the Shallow-only configuration.

To further interpret this behavior, a SHapley Additive exPlanations (SHAP) \cite{lundberg_unified_2017} analysis is conducted on the Shallow model (\autoref{fig:shap_summary}). Indentation entropy emerges as the most discriminative feature. Higher indentation entropy is associated with machine-generated predictions, indicating that lines are spread over more indentation levels, i.e., more hierarchical formatting patterns. In contrast, human-written code tends to have a flatter layout concentrated on fewer indentation levels. Similar trends are observed for blank-line ratio and line-length statistics, reinforcing the importance of surface-level stylistic cues.

\begin{figure}[htbp]
\centering
\includegraphics[width=\linewidth]{./figures/shap_summary.pdf}
\caption{SHAP summary plot for the LightGBM classifier trained on Shallow features.}
\label{fig:shap_summary}
\end{figure}
\subsection{Subtask B: Multi-Class Authorship Detection}
Subtask B presents severe class imbalance: the Human class contains 442k samples, whereas certain generator classes contain as few as 2k samples. Under such skewed distributions, single-stage multi-class classification tends to bias predictions toward the majority class.

To mitigate this effect, a hierarchical two-stage framework is adopted. In Stage 1, a binary classifier distinguishes Human-written from Machine-generated code. In Stage 2, samples predicted as machine-generated are passed to a fine-grained classifier to identify the generator family (10 classes).

As shown in \autoref{tab:tab4}, the two-stage framework slightly improves performance over the single-stage baseline (0.39 vs.\ 0.37 F1). The Stage 1 classifier achieves an F1 score of 0.97 on the validation set (\autoref{tab:tab3}), effectively separating most human-written samples from machine-generated ones. However, generator attribution in Stage 2 remains challenging due to subtle inter-generator similarities and residual imbalance. These results suggest that hierarchical decoupling alleviates majority bias but does not fully resolve the difficulty of fine-grained attribution.
\begin{table}[h]
\centering
\begin{tabular}{lc}
\toprule
Method & F1 \\
\midrule
GraphCodeBERT & 0.97 \\
\bottomrule
\end{tabular}
\caption{Stage 1 binary classification results on validation.}
\label{tab:tab3}
\end{table}

\begin{table}[h]
\centering
\begin{tabular}{lc}
\toprule
Method & F1 \\
\midrule
Single-stage & 0.37 \\
Two-stage & 0.39 \\
+Focal Loss & 0.39 \\
+MultiTask(lang head) & 0.39 \\
\bottomrule
\end{tabular}
\caption{Two-stage and Single-stage final classification results.}
\label{tab:tab4}
\end{table}

\subsection{Subtask C: Hybrid Code Detection}
Detecting \textit{Hybrid} code (partially machine-generated) and \textit{Adversarial} code presents a unique challenge due to the subtle semantic boundaries between classes. We use UniXcoder as our backbone. We integrated Supervised Contrastive Learning (SupCon) to enforce tighter clustering of same-class representations in the embedding space, alongside Focal Loss to prioritize hard-to-classify examples. To address the scarcity of hybrid training samples, we implemented a token-level splicing strategy. This involves randomly concatenating truncated human and machine code segments before training to simulate diverse transition patterns found in real-world hybrid code.

As shown in \autoref{tab:tab5}, the baseline model achieved an F1 score of 0.57. Incorporating SupCon and Focal Loss resulted in a significant performance boost to 0.60. After applying data augmentation, it improved further to 0.62.
\begin{table}[h]
\centering
\begin{tabular}{lc}
\toprule
Method & F1 \\
\midrule
UniXcoder & 0.57 \\
+SupCon,Focal & 0.60 \\
+Data Augmentation & \textbf{0.62} \\
\bottomrule
\end{tabular}
\caption{Subtask C classification results.}
\label{tab:tab5}
\end{table}

\section{Conclusion}
\label{sec:sec5}
This paper presented the YNU-HPCC system for SemEval-2026 Task 13 and systematically evaluated multiple modeling paradigms under heterogeneous detection settings. Across the three subtasks, different forms of distributional variation were encountered, including cross-domain shifts, severe class imbalance, and intra-sample stylistic mixture. Our findings indicate that robustness in machine-generated code detection does not solely depend on increasing representational depth. Instead, shallow stylometric modeling, hierarchical decomposition, and mixed-distribution exposure each play important roles under their respective task constraints. While fine-grained generator attribution remains challenging, especially under extreme imbalance, the proposed strategies consistently mitigate distribution-induced degradation. Future work will explore more principled domain generalization techniques and generator-invariant representations to further improve robustness in realistic deployment scenarios.

\section*{Acknowledgments}
This work was supported by the National Natural Science Foundation of China (NSFC) under Grant Nos.~61966038 and 62266051. The authors would like to thank the anonymous reviewers for their constructive comments.

% \clearpage
\bibliography{references}
\bibliographystyle{acl_natbib}

\clearpage
\appendix
\section*{Appendix}
\label{sec:appe}
\section{Detailed Description of Handcrafted Features}
\label{app:appa}
In this appendix, we provide precise definitions and calculation methodologies for the feature sets used in Subtask A.

\subsection{Shallow Features}
Shallow features capture surface-level stylometric patterns and are extracted using regex-based tokenization and simple string analysis. The complete list of shallow features is summarized in \autoref{tab:feature_defs}.

\paragraph{Indentation Entropy Calculation.}
To quantify the consistency of indentation, we calculate the entropy of leading space counts for non-empty lines:
\begin{equation}
H(\texttt{Indent})=-\sum_{i\in I}{p(i)\log{p(i)}}
\end{equation}
where $I$ is the set of unique indentation lengths (e.g., 2 spaces, 4 spaces), and $p(i)$ is the relative frequency of indentation length $i$ among the non-empty lines of the snippet.

\subsection{AST Features}
To enable language-specific AST parsing, we first employ a lightweight Language Identification (LID) model comprising a TF-IDF vectorizer and a Linear Support Vector Machine (LinearSVC). The model achieves 94\% validation accuracy, with misclassifications primarily confined to syntactically similar languages like C and C++. Subsequently, we utilize tree-sitter to parse the code into ASTs. Despite the occasional conflation of C and C++ by the LID model, we empirically observed zero parsing failures. This robustness is attributed to the high degree of grammatical backward compatibility (e.g., a C++ parser successfully processing C code), ensuring that structural metrics remain valid even under dialect misclassification.
After parsing, we traverse the tree to extract structural metrics summarized in \autoref{tab:ast_features}.
\begin{table}[htbp]
\centering
\small
\begin{tabularx}{\linewidth}{@{}l X@{}}
\toprule
Feature Name & Description \\
\midrule

Ast\_total\_nodes & Total number of nodes in the syntax tree. \\

Ast\_unique\_node\_types & Number of distinct node types (e.g., \texttt{function\_definition}, \texttt{identifier}). \\

Ast\_node\_type\_entropy & Shannon entropy of the distribution of node types. \\

Ast\_max\_depth & Maximum depth of the tree from root to leaf. \\

Ast\_avg\_depth & Average depth of all nodes in the tree. \\

Ast\_leaf\_ratio & Ratio of leaf nodes to total nodes. \\

Ast\_branching\_mean & Average number of children per non-leaf node. \\

Ast\_node\_per\_token & Ratio of Ast\_total\_nodes to the shallow token count. \\

Ast\_if\_loop\_count & Counts of structural control nodes (parsed, no regex). \\

Ast\_call\_binary\_expr & Ratio of function calls and binary expressions to total nodes. \\

\bottomrule

\end{tabularx}
\caption{AST-Based Feature Definitions}
\label{tab:ast_features}
\end{table}
\subsection{Logit Features}
These features are derived from the output probability distributions of a frozen Large Language Model (e.g., CodeLlama-7B). Let the input token sequence be denoted as $x = (x_1, \dots, x_T)$, and let $y_t = x_t$ denote the true (observed) token at position $t$, which is scored given the preceding context $x_{<t}$. The detailed definitions of the derived statistics are summarized in \autoref{tab:llm_features}.
\begin{table}[htbp]
\centering
\small
\begin{tabularx}{\linewidth}{@{}l X@{}}
\toprule
Feature Name & Description \\
\midrule

Mean\_true\_logprob 
& Average log-probability of the true token $y_t$ given context $x_{<t}$:
$\frac{1}{T}\sum_{t=1}^{T} \log P(y_t \mid x_{<t})$. \\

Mean\_nll 
& Mean negative log-likelihood:
$-\frac{1}{T}\sum_{t=1}^{T} \log P(y_t \mid x_{<t})$. \\

Std\_true\_logprob 
& Standard deviation of the true-token log-probabilities. \\

Entropy\_topk\_raw 
& Entropy of the Top-$K$ ($K=20$) probability distribution:
$-\sum_{i \in \text{Top-}K} p_i \log p_i$. \\

Entropy\_topk\_norm 
& Entropy of the Top-$K$ distribution normalized by Top-$K$ mass. \\

Maxprob\_mean 
& Average probability of the most likely token:
$\frac{1}{T}\sum_{t=1}^{T} \max_i P(i \mid x_{<t})$. \\

Margin\_mean 
& Average difference between Top-1 and Top-2 probabilities:
$\frac{1}{T}\sum_{t=1}^{T} (p^{(1)}_t - p^{(2)}_t)$. \\

Topk\_mass\_mean 
& Average cumulative probability of the Top-$K$ tokens:
$\frac{1}{T}\sum_{t=1}^{T} \sum_{i \in \text{Top-}K} p_i$. \\

True\_rank\_mean 
& Average rank of the true token $y_t$ in the sorted vocabulary. \\

Miss\_rate 
& Fraction of tokens where the true token $y_t$ is not in the Top-$K$ predictions. \\

\bottomrule
\end{tabularx}
\caption{LLM Probability-Based Feature Definitions}
\label{tab:llm_features}
\end{table}


\begin{table*}[t]
\centering
\small
\setlength{\tabcolsep}{6pt} % adjust as needed
\begin{tabularx}{\textwidth}{@{}l l X@{}}
\toprule
Category & Feature Name & Description \\
\midrule

\multirow{3}{*}{Counts}
& Num\_chars  & Total number of characters in the code ($N_{\mathrm{chars}}$). \\
& Num\_lines  & Total number of lines ($N_{\mathrm{lines}}$). \\
& Num\_tokens & Total number of tokens ($N_{\mathrm{tokens}}$) extracted via regex \texttt{\textbackslash w+}. \\

\midrule
\multirow{11}{*}{Ratios}
& Blank\_line\_ratio      & Count of empty lines / $N_{\mathrm{lines}}$. \\
& Comment\_ratio          & Count of comment patterns (//,\#,/*) / $N_{\mathrm{lines}}$. \\
& Numeric\_literal\_ratio  & Count of numeric literals / $N_{\mathrm{tokens}}$. \\
& String\_literal\_ratio   & Count of string literals / $N_{\mathrm{tokens}}$. \\
& Operator\_ratio         & Count of operators (e.g., +, -, *, =) / $N_{\mathrm{tokens}}$. \\
& Keyword\_ratio          & Count of reserved keywords / $N_{\mathrm{tokens}}$. \\
& Vocab\_ratio            & Vocabulary size (unique tokens) / $N_{\mathrm{tokens}}$. \\
& Avg\_line\_length       & $N_{\mathrm{chars}}/N_{\mathrm{lines}}$. \\
& Max\_line\_length       & Length of the longest line in the snippet. \\
& Indent\_entropy         & Shannon entropy of indentation levels (see the indentation entropy equation in Appendix~\ref{app:appa}). \\
& Max\_brace\_depth       & Maximum nesting depth of \{ or \}. \\

\midrule
\multirow{3}{*}{Control Flow}
& Cnt\_if, Cnt\_for, Cnt\_while & Raw counts of \texttt{if}, \texttt{for}, \texttt{while}. \\
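& Cnt\_return, Cnt\_break & Raw counts of \texttt{return}, \texttt{break}. \\
% NOTE(review): Cnt_continue was listed in both of the last two rows. With the duplicate removed this table defines 20 features, while the ablation-table caption reports 21 shallow dimensions -- confirm whether a feature is missing here.
& Cnt\_continue & Raw count of \texttt{continue} statements. \\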

\bottomrule
\end{tabularx}
\caption{Shallow Feature Definitions}
\label{tab:feature_defs}
\end{table*}


\end{document}
