@inproceedings{she-etal-2026-echovlm,
title = "{E}cho{VLM}: Dynamic Mixture-of-Experts Vision-Language Model for Universal Ultrasound Intelligence",
author = "She, Chaoyin and
Lu, Ruifang and
Chen, Lida and
Wang, Wei and
Huang, Qinghua",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-acl/2026.acl-long.494/",
pages = "10800--10822",
ISBN = "979-8-89176-390-6",
abstract = "Ultrasound is the preferred early cancer screening modality due to non-ionizing radiation, cost-effectiveness, and real-time imaging, yet conventional diagnosis relies heavily on physician expertise, causing significant subjectivity and limited efficiency. Vision-Language Models (VLMs) show promise but lack ultrasound-specific knowledge and multi-organ generalization. We propose EchoVLM, the first open-source 10-billion-parameter ultrasound-tailored VLM with a Mixture-of-Experts (MoE) architecture. It is infused with knowledge across seven anatomical systems, trained on 208,941 clinical cases, 1.47 million ultrasound key-frame images, and over 100 diseases or imaging findings. Supporting clinical report generation, diagnosis prediction, and Visual Question Answering (VQA), it outperforms Qwen2-VL by 7.58 BLEU-1 and 3.45 ROUGE-1 points in report generation. This work shows substantial potential for establishing a general-purpose ultrasound VLM and lays a technical foundation for clinical translation. Source code and model weights are available at https://github.com/Asunatan/EchoVLM."
}Markdown (Informal)
[EchoVLM: Dynamic Mixture-of-Experts Vision-Language Model for Universal Ultrasound Intelligence](https://preview.aclanthology.org/ingest-acl/2026.acl-long.494/) (She et al., ACL 2026)
ACL