@comment{
  Workshop paper entry for the RAG4Reports 2026 proceedings.
  Case-sensitive names in title and booktitle are brace-protected as whole
  words so that sentence-casing styles leave them intact.
  NOTE(review): url points at a preview.aclanthology.org ingest path --
  confirm the canonical URL once the volume is published.
  NOTE(review): address appears to hold the conference venue rather than the
  publisher city -- confirm this is what the target style expects.
}
@inproceedings{chatterjee-das-2026-ju,
  title     = {{JU-NLP-PG} at {RAG4Reports} 2026: Memory-Efficient Multilingual Report Generation with 4-bit Quantized {LLMs}},
  author    = {Chatterjee, Swayam and
               Das, Dipankar},
  editor    = {Yang, Eugene and
               Lawrie, Dawn and
               MacAvaney, Sean and
               Mayfield, James and
               Soldaini, Luca and
               Yates, Andrew},
  booktitle = {Proceedings of the 1st Workshop on Multilingual Report Generation via Retrieval Augmented Generation ({RAG4Reports} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.rag4reports-1.16/},
  pages     = {108--112},
  isbn      = {979-8-89176-417-0},
  abstract  = {In the present article, we have described our system developed for participating in Task B on Multilingual Report Generation under RAG4Reports 2026 at ACL 2026 with submitted run ID ju{\_}nlp{\_}pg. The problem statement is given a report request in English, the system retrieves relevant passages from a four million multilingual document corpus (English, Chinese, Russian, Arabic) and generates a grounded, citation-bearing report. Our core challenge was how to fit a large retrieval corpus along with a capable generative model on a two-GPU node with {\ensuremath{\approx}}29 GB RAM. We addressed the challenge employing three different techniques: (1) 4-bit NF4 quantization, shrinking the LLM from {\ensuremath{\approx}}14 GB to {\ensuremath{\approx}}4 GB; (2) memory-mapped, chunked FAISS index construction over pre-computed multilingual-e5-large embeddings; and (3) strict model-loading order to prevent heap fragmentation. On the other hand, the reports are structured around topic nuggets to directly target the Auto-ARGUE evaluation signal.},
}
Markdown (Informal)
[JU-NLP-PG at RAG4Reports 2026: Memory-Efficient Multilingual Report Generation with 4-bit Quantized LLMs](https://preview.aclanthology.org/ingest-acl-workshops/2026.rag4reports-1.16/) (Chatterjee & Das, RAG4Reports 2026)
ACL