@inproceedings{peng-etal-2025-cuckoo,
title = "Cuckoo: An {IE} Free Rider Hatched by Massive Nutrition in {LLM}{'}s Nest",
author = "Peng, Letian and
Wang, Zilong and
Yao, Feng and
Shang, Jingbo",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.acl-long.66/",
pages = "1301--1315",
ISBN = "979-8-89176-251-0",
abstract = "Massive high-quality data, both pre-training raw texts and post-training annotations, have been carefully prepared to incubate advanced large language models (LLMs). In contrast, for information extraction (IE), pre-training data, such as BIO-tagged sequences, are hard to scale up. We show that IE models can act as free riders on LLM resources by reframing next-token \textit{prediction} into \textit{extraction} for tokens already present in the context. Specifically, our proposed next tokens extraction (NTE) paradigm learns a versatile IE model, \textit{Cuckoo}, with 102.6M extractive data converted from LLM{'}s pre-training and post-training data. Under the few-shot setting, Cuckoo adapts effectively to traditional and complex instruction-following IE with better performance than existing pre-trained IE models. As a free rider, Cuckoo can naturally evolve with the ongoing advancements in LLM data preparation, benefiting from improvements in LLM training pipelines without additional manual effort."
}
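
The abstract describes the next tokens extraction (NTE) paradigm: next-token prediction is reframed as extraction whenever the target token already appears in the context, so raw LLM training text can be converted into BIO-style tagging supervision. The following is a minimal sketch of that conversion idea, assuming whitespace tokenization and a simplified B/O label scheme; the function name and data handling are illustrative, not the authors' implementation.

# Sketch of the NTE conversion described in the abstract: instead of
# *predicting* the next token, tag its earlier occurrences in the context,
# turning any raw text into extractive (tagging) training data.
def nte_instances(tokens):
    """Yield (context, tags) pairs where the next-token target is
    recoverable by tagging its earlier mentions in the context."""
    for i in range(1, len(tokens)):
        next_tok = tokens[i]          # token an LM would be asked to predict
        context = tokens[:i]
        if next_tok not in context:   # NTE applies only when the target
            continue                  # already appears in the context
        # Tag every earlier occurrence of the target ("B"), "O" elsewhere.
        tags = ["B" if t == next_tok else "O" for t in context]
        yield context, tags

# Example: "Cuckoo" reappears, so its earlier mention becomes an
# extraction target rather than a prediction target.
text = "Cuckoo is a free rider ; train Cuckoo on converted data".split()
for ctx, tags in nte_instances(text):
    print(list(zip(ctx, tags)))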