@comment{Conference paper in the BioNLP 2026 proceedings (publisher: Association for Computational Linguistics). NOTE(review): the url field points at a preview.aclanthology.org ingest path; confirm the final public URL before relying on it.}
@inproceedings{zhang-etal-2026-towards-radiologist,
  title     = {Towards a Radiologist Imitation Framework for {3D} {CT} Diagnosis with Multimodal {LLMs}},
  author    = {Zhang, Kaidi and
               Yan, Zhiyuan and
               Cheng, Gao and
               Cai, Zhenyang},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{BioNLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.85/},
  pages     = {1056--1065},
  isbn      = {979-8-89176-434-7},
  abstract  = {Three-dimensional Computed Tomography (3D CT) is a cornerstone of precision medicine. Most AI diagnostic models analyze large numbers of CT slices uniformly, treating all slices as equally important. While this has partly accelerated radiologists' workflows, it overlooks that clinically relevant information is often sparsely distributed throughout a volume. Without targeted or weighted processing, fine-grained cues may be missed and substantial computation wasted on diagnostically uninformative slices. We propose a radiologist-simulating framework for selective and efficient 3D CT interpretation. Evaluated on a 3D CT dataset covering eight thoracic lesion types, it was compared with state-of-the-art multimodal large language models such as GPT-4o and supervised visual backbones including ViT and ResNet-50. Using accuracy, F1-score, AUC, and blind radiologist assessment, Screen-CLIP achieved an AUC of 0.87 and F1-score of 0.82, surpassing ViT Base (AUC: 0.84). For report generation, our method outperformed M3D across all metrics, reaching a BLEU-Avg of 29.03, and achieved the highest average Doctors' Score (6.16/10) in a preliminary human evaluation.},
}
Markdown (Informal)
[Towards a Radiologist Imitation Framework for 3D CT Diagnosis with Multimodal LLMs](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.85/) (Zhang et al., BioNLP 2026)
ACL