@inproceedings{zhang-wang-2025-findr,
title = "{FINDR}: A Fast Influential Data Selector for {NL}2{C}ode Pretraining",
author = "Zhang, Xinliang Frederick and
Wang, Lu",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.184/",
pages = "3456--3476",
ISBN = "979-8-89176-298-5",
abstract = "Pretraining on massive corpora has given rise to large language models (LLMs) with multi-task capabilities. However, real-world applications often require more specialized training, as is the case of NL2Code. We approach this specialization through the lens of data selection, i.e., identifying a subset of a large corpus that aligns with a desired target distribution{---}a challenge that remains under-explored within NL2Code. Existing methods are typically designed for selecting instruction-tuning data, and might not easily scale to large-scale code repositories; while methods for NL2Code do exist, they primarily rely on coarse heuristics{---}{--}such as repo stars{---}{--}for filtering. To bridge this gap, we propose FINDR, an efficient data selection method that extends logistic regression with feature-wise importance reweighting{---}marking it, to our knowledge, the first fine-grained solution to NL2Code pretraining. Our method uses hashed n-grams and code-aware features to capture code-specific patterns, and then apply informative priors to reweight feature importance when computing influence scores. Extensive experiments on NL2Python and NL2SQL, with two model families, show that FINDR consistently outperforms strong baselines in both execution accuracy and token efficiency. Notably, pretraining on only 2{\%} of FINDR-selected data boosts Gemma by over 29{\%} in both domains, even surpassing CodeGemma (pretrained on 300x more examples) by 10{\%} in Python."
}
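For orientation only, below is a minimal illustrative sketch of the selection recipe the abstract outlines at a high level: hashed n-gram features, a logistic-regression scorer, and a per-feature prior that reweights feature importance before influence scores are computed. This is not the authors' implementation; the scikit-learn-based setup, all function names, and all parameters are assumptions.

import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression

# Hashed word n-grams give a fixed-size, vocabulary-free feature space.
vectorizer = HashingVectorizer(ngram_range=(1, 2), n_features=2**18,
                               alternate_sign=False)

def fit_selector(target_docs, general_docs, feature_prior=None):
    """Train a classifier to separate target-domain text from general text.

    feature_prior: optional (n_features,) array of weights; a hypothetical
    stand-in for an informative prior that upweights, e.g., code-aware features.
    """
    X = vectorizer.transform(list(target_docs) + list(general_docs))
    y = np.array([1] * len(target_docs) + [0] * len(general_docs))
    if feature_prior is not None:
        X = X.multiply(feature_prior)  # element-wise reweighting of features
    return LogisticRegression(max_iter=1000).fit(X, y)

def influence_scores(clf, candidate_docs, feature_prior=None):
    """Score candidates by the classifier's log-odds of being target-like."""
    X = vectorizer.transform(candidate_docs)
    if feature_prior is not None:
        X = X.multiply(feature_prior)
    return clf.decision_function(X)

# Usage sketch: keep the top-k highest-scoring candidates for pretraining.
# clf = fit_selector(target_docs, general_docs, feature_prior)
# scores = influence_scores(clf, candidate_docs, feature_prior)
# selected = [d for _, d in sorted(zip(scores, candidate_docs), reverse=True)[:k]]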