@inproceedings{zou-etal-2024-promptintern,
title = "{P}rompt{I}ntern: Saving Inference Costs by Internalizing Recurrent Prompt during Large Language Model Fine-tuning",
author = "Zou, Jiaru and
Zhou, Mengyu and
Li, Tao and
Han, Shi and
Zhang, Dongmei",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.findings-emnlp.602/",
doi = "10.18653/v1/2024.findings-emnlp.602",
pages = "10288--10305",
abstract = "Recent advances in fine-tuning large language models (LLMs) have greatly enhanced their usage in domain-specific tasks. Despite the success, fine-tuning continues to rely on repeated and lengthy prompts, which escalate computational expenses, require more resources, and lead to slower inference. In this paper, we present a novel approach, PromptIntern, which internalizes prompt knowledge during model fine-tuning to achieve efficient inference and save costs. Instead of compressing the prompts for a vanilla model, PromptIntern aims to embed the recurrent prompt directly into the model parameters. We design a fine-tuning pipeline that includes instruction template compression, few-shot example absorption, and a progressive internalization strategy, effectively diminishing the need for intricate prompts during inference. Comprehensive experiments on challenging NL2Code tasks demonstrate that our method reduces input tokens by more than 90{\%}, accelerates inference by 4.2 times, and reduces monetary inference costs by 88.3{\%}."
}