% Conference-paper record: Kumar, Small LLMs for Biomedical Claim Verification (BioNLP 2026).
% NOTE(review): the url field points at a preview.aclanthology.org ingestion
% build; presumably it should be replaced by the canonical aclanthology.org
% address once the volume is published -- confirm before relying on it.
@inproceedings{kumar-2026-small,
  title     = {Small {LLMs} for Biomedical Claim Verification: Cost-Effective Fine-Tuning, Structural Dataset Shortcuts, and Cross-Domain Generalization},
  author    = {Kumar, Gaurav},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{BioNLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.57/},
  pages     = {704--712},
  isbn      = {979-8-89176-434-7},
  abstract  = {Large Language Models such as GPT-4o and GPT-5 achieve strong zero-shot performance on biomedical claim verification, but cost and opacity limit scalable use. We fine-tune three small LLMs; Phi-3-mini (3.8B), Qwen2.5-3B, and Mistral-7B; via QLoRA on SciFact and HealthVer, providing the first study of QLoRA models against GPT-4o and fine-tuned BioLinkBERT encoders. Mistral-7B QLoRA achieves higher F1 than both GPT-4o and GPT-5 (up to 12{\%} gain) at 44.5x lower cost using just 1,008 training examples, representing a compelling cost-quality trade-off. We conduct extensive in-domain and cross-domain evaluation: models trained on SciFact tested on HealthVer and vice versa, at matched sizes to isolate dataset structure from data quantity. We identify a previously unreported structural artifact in SciFact that inflates in-domain scores, and show through bidirectional out-of-domain evaluation that training on structurally sound data enables robust cross-domain transfer. We plan to release all code and adapter checkpoints.},
}

Markdown (Informal)
[Small LLMs for Biomedical Claim Verification: Cost-Effective Fine-Tuning, Structural Dataset Shortcuts, and Cross-Domain Generalization](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.57/) (Kumar, BioNLP 2026)
ACL