@inproceedings{karypis-etal-2024-extending,
title = "Extending Input Contexts of Language Models through Training on Segmented Sequences",
author = "Karypis, Petros and
McAuley, Julian and
Karypis, George",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.findings-naacl.191/",
doi = "10.18653/v1/2024.findings-naacl.191",
pages = "3040--3052",
    abstract = "Effectively training language models on long inputs poses many technical challenges. As a cost consideration, language models are pretrained on a fixed sequence length before being adapted to longer sequences. We explore various methods for adapting models to longer inputs by training on segmented sequences and an interpolation-based method for extending absolute positional embeddings. We develop a training procedure to extend the input context size of pretrained models with no architectural changes and no additional memory costs than training on the original input lengths. By sub-sampling segments from long inputs while maintaining their original position, the model is able to learn new positional interactions. Our method benefits both models trained with absolute positional embeddings, by extending their input contexts, as well as popular relative positional embedding methods, showing a reduced perplexity on sequences longer than they were trained on. We demonstrate our method can extend input contexts by a factor of 4{\texttimes} while improving perplexity."
}
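
To illustrate the idea described in the abstract, here is a minimal sketch of training on segmented sequences while preserving original position indices. This is not the authors' released code; the function name `sample_segments`, the uniform segment-sampling scheme, and all parameter names are illustrative assumptions.

```python
# Hypothetical sketch: sub-sample non-overlapping segments from a long sequence
# while keeping each token's ORIGINAL position index, so a model trained at a
# short fixed length still sees large position values.
import random

def sample_segments(token_ids, train_len=1024, num_segments=4, seed=0):
    """Pick `num_segments` non-overlapping chunks whose total length is
    `train_len`; return (tokens, positions) with original positions kept."""
    rng = random.Random(seed)
    seg_len = train_len // num_segments
    max_start = len(token_ids) - seg_len
    # Candidate starts on a seg_len grid guarantee the chunks do not overlap.
    starts = sorted(rng.sample(range(0, max_start, seg_len), num_segments))
    tokens, positions = [], []
    for s in starts:
        tokens.extend(token_ids[s:s + seg_len])
        positions.extend(range(s, s + seg_len))  # original positions, not 0..train_len-1
    return tokens, positions

# Usage: a 4096-token document becomes a 1024-token training example whose
# position IDs still range up to ~4096, exposing the model to longer-range
# positional interactions at the memory cost of the short input length.
long_doc = list(range(4096))  # stand-in for real token IDs
toks, pos = sample_segments(long_doc)
print(len(toks), max(pos))
```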