@inproceedings{zhang-etal-2024-jellyfish,
title = "Jellyfish: Instruction-Tuning Local Large Language Models for Data Preprocessing",
author = "Zhang, Haochen and
Dong, Yuyang and
Xiao, Chuan and
Oyamada, Masafumi",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.emnlp-main.497/",
doi = "10.18653/v1/2024.emnlp-main.497",
pages = "8754--8782",
abstract = "This paper explores the utilization of LLMs for data preprocessing (DP), a crucial step in the data mining pipeline that transforms raw data into a clean format. We instruction-tune local LLMs as universal DP task solvers that operate on a local, single, and low-priced GPU, ensuring data security and enabling further customization. We select a collection of datasets across four representative DP tasks and construct instruction data using data configuration, knowledge injection, and reasoning data distillation techniques tailored to DP. By tuning Mistral-7B, Llama 3-8B, and OpenOrca-Platypus2-13B, our models, Jellyfish-7B/8B/13B, deliver competitiveness compared to GPT-3.5/4 models and strong generalizability to unseen tasks while barely compromising the base models' abilities in NLP tasks. Meanwhile, Jellyfish offers enhanced reasoning capabilities compared to GPT-3.5. Our models are available at: https://huggingface.co/NECOUDBFM/JellyfishOur instruction dataset is available at: https://huggingface.co/datasets/NECOUDBFM/Jellyfish-Instruct"
}