@inproceedings{nguyen-etal-2026-vilegallm,
  title     = {{ViLegalLM}: Language Models for {Vietnamese} Legal Text},
  author    = {Nguyen, Truong-Phuc and
               Nguyen, Quy-Nhan and
               Nguyen, Minh-Tien},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.findings-acl.1801/},
  pages     = {36136--36150},
  isbn      = {979-8-89176-395-1},
  abstract  = {We present ViLegalLM, comprising ViLegalBERT and ViLegalQwen, the first suite of Vietnamese pretrained language models for legal text understanding and generation. It includes one encoder-only model (ViLegalBERT, 135M parameters) and two decoder-only models (ViLegalQwen2.5-1.5B-Base and ViLegalQwen3-1.7B-Base), all continually pretrained on a newly curated 16GB Vietnamese legal corpus, significantly larger than previous work. To mitigate data scarcity, we construct three synthetic datasets using LLM-based generation and hard negative mining for True/False QA, Multiple Choice QA, and Natural Language Inference. We establish state-of-the-art results among open-source models on four main Vietnamese legal downstream tasks spanning ten benchmarks, demonstrating that continual pretraining from base models consistently outperforms instruction-tuned adaptation. Source codes, corpus, datasets, and model checkpoints are publicly available at https://github.com/ntphuc149/ViLegalLM.},
}
Markdown (Informal)
[ViLegalLM: Language Models for Vietnamese Legal Text](https://preview.aclanthology.org/ingest-acl-workshops/2026.findings-acl.1801/) (Nguyen et al., Findings 2026)
ACL
- Truong-Phuc Nguyen, Quy-Nhan Nguyen, and Minh-Tien Nguyen. 2026. ViLegalLM: Language Models for Vietnamese Legal Text. In Findings of the Association for Computational Linguistics: ACL 2026, pages 36136–36150, San Diego, California, United States. Association for Computational Linguistics.