@comment{
  CodeEvo paper in the ACL 2026 proceedings (Volume 1: Long Papers).
  Case-sensitive words in title/booktitle are protected as whole words.
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  path; confirm the canonical Anthology link before relying on it.
}
@inproceedings{sun-etal-2026-codeevo,
  title     = {{CodeEvo}: Interaction-Driven Synthesis of Code-centric Data through Hybrid and Iterative Feedback},
  author    = {Sun, Qiushi and
               Gong, Jingyang and
               Li, Lei and
               Guo, Qipeng and
               Yuan, Fei},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.438/},
  pages     = {9668--9687},
  isbn      = {979-8-89176-390-6},
  abstract  = {Acquiring high-quality instruction-code pairs is essential for training Large Language Models for code generation. While automated synthesis has emerged as an alternative to expensive manual curation, current approaches often rely on rigid heuristics, yielding data that is ungrounded or lacks logical complexity. We propose CodeEvo, a dual-agent architecture comprising a Coder for iterative solution synthesis and a Reviewer to orchestrate the generation trajectory. To transcend the limitations of existing heuristics, the Reviewer formulates a Schema to systematically architect logic and complexity through an interleaved synthesis of instructions and code. This process is further reinforced by a hybrid verification protocol synergizing deterministic compiler feedback with semantic evaluation. Under this framework, we construct CodeEvo-100K, a large-scale dataset of instruction{--}code pairs with stepped difficulty levels. Extensive experiments demonstrate that models fine-tuned on CodeEvo data significantly outperform established baselines across code generation benchmarks. In-depth analyses further provide insights into effective code-centric data synthesis. Code and data are available at \url{https://github.com/QiushiSun/CodeEvo}.},
}
Markdown (Informal)
[CodeEvo: Interaction-Driven Synthesis of Code-centric Data through Hybrid and Iterative Feedback](https://preview.aclanthology.org/ingest-acl/2026.acl-long.438/) (Sun et al., ACL 2026)
ACL