@inproceedings{wu-song-2025-scaling,
title = "Scaling Context, Not Parameters: Training a Compact 7{B} Language Model for Efficient Long-Context Processing",
author = "Wu, Chen and
Song, Yin",
editor = "Rehm, Georg and
Li, Yunyao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-industry.6/",
pages = "61--68",
ISBN = "979-8-89176-288-6",
abstract = "We present MegaBeam-Mistral-7B, a language model that supports 512K-token context length. Our work addresses practical limitations in long-context training, supporting real-world tasks such as compliance monitoring and verification. Evaluated on three long-context benchmarks, our 7B-parameter model demonstrates superior in-context learning performance on HELMET and robust retrieval and tracing capability on RULER. It is currently the only open model to achieve competitive long-range reasoning on BABILong at 512K context length without RAG or targeted fine-tuning. Released as fully open source under the Apache 2.0 license, the model has been downloaded over 100,000 times on Hugging Face."
}
Markdown (Informal)
[Scaling Context, Not Parameters: Training a Compact 7B Language Model for Efficient Long-Context Processing](https://aclanthology.org/2025.acl-industry.6/) (Wu & Song, ACL 2025)