@inproceedings{yeen-etal-2025-manta,
title = "{MANTA}: A Scalable Pipeline for Transmuting Massive Web Corpora into Instruction Datasets",
author = "Yeen, Heuiyeen and
Hong, Seokhee and
Yun, Hyeongu and
Lee, Jinsik",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1019/",
doi = "10.18653/v1/2025.findings-emnlp.1019",
pages = "18755--18770",
ISBN = "979-8-89176-335-7",
abstract = "We introduce MANTA, an automated pipeline that generates high-quality large-scale instruction fine-tuning datasets from massive web corpora while preserving their diversity and scalability. By extracting structured syllabi from web documents and leveraging high-performance LLMs, our approach enables highly effective query-response generation with minimal human intervention. Extensive experiments on 8B-scale LLMs demonstrate that fine-tuning on the MANTA-1M dataset significantly outperforms other massive dataset generation methodologies, particularly in knowledge-intensive tasks such as MMLU and MMLU-Pro, while also delivering superior performance across a broad spectrum of tasks. Moreover, MANTA supports seamless scalability by allowing the continuous integration of web corpus data, enabling expansion into domains requiring intensive knowledge."
}
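The abstract describes a two-step generation scheme: extract a structured syllabus from each web document, then have a high-performance LLM write query-response pairs grounded in that document. The paper's prompts and code are not reproduced here, so the following is only a minimal illustrative sketch; the prompt strings, helper names, and the `complete` callable (any text-in/text-out LLM interface) are assumptions, not the authors' implementation.

```python
from typing import Callable, Dict, List

# Hypothetical sketch of a MANTA-style two-stage pipeline:
# (1) extract a structured syllabus (topic list) from a web document,
# (2) generate one instruction-response pair per syllabus topic.
# Prompts and function names are illustrative assumptions.

SYLLABUS_PROMPT = (
    "Read the following web document and list the key topics it covers, "
    "one per line, as a structured syllabus:\n\n{document}"
)

QA_PROMPT = (
    "Using the document below, write one instruction (question) and a "
    "detailed answer about the topic '{topic}'.\n"
    "Format:\nQ: <instruction>\nA: <response>\n\nDocument:\n{document}"
)


def extract_syllabus(document: str, complete: Callable[[str], str]) -> List[str]:
    """Ask an LLM for a structured syllabus (one topic per line) of a web document."""
    raw = complete(SYLLABUS_PROMPT.format(document=document))
    return [line.strip("- ").strip() for line in raw.splitlines() if line.strip()]


def generate_pairs(document: str, complete: Callable[[str], str]) -> List[Dict[str, str]]:
    """Generate one instruction-response pair per extracted syllabus topic."""
    pairs: List[Dict[str, str]] = []
    for topic in extract_syllabus(document, complete):
        raw = complete(QA_PROMPT.format(topic=topic, document=document))
        # Keep only completions that follow the requested Q:/A: format.
        if "Q:" in raw and "A:" in raw:
            question = raw.split("Q:", 1)[1].split("A:", 1)[0].strip()
            answer = raw.split("A:", 1)[1].strip()
            pairs.append({"instruction": question, "response": answer})
    return pairs
```

In this sketch, `complete` can be any function that sends a prompt to an LLM and returns its text output, so the pipeline stays independent of a particular model provider; scaling to a massive corpus would simply mean mapping `generate_pairs` over the documents.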