@comment{
  Conference paper introducing the BioConflict benchmark, published in the
  BioNLP 2026 proceedings.
  Coined names (BioConflict, BioNLP) are brace-protected as whole words so
  that styles which sentence-case titles keep their capitalisation.
  NOTE(review): the url host is preview.aclanthology.org with an ingest path,
  which looks like a staging link -- confirm the permanent URL before release.
}
@inproceedings{kirubakaran-gagnier-2026-bioconflict,
  title     = {{BioConflict}: A Benchmark for Evaluating Large Language Models in Biomedical Contradiction Detection and Consensus Synthesis},
  author    = {Kirubakaran, Ashwin and
               Gagnier, Henry},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{BioNLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.44/},
  pages     = {552--558},
  isbn      = {979-8-89176-434-7},
  abstract  = {Resolving contradictions in biomedical literature requires more than factual recall; it demands identifying the hidden variables that explain divergent findings. Existing NLI benchmarks such as MedNLI operate at the sentence level and fail to capture document-level conflicts driven by differences in dosage, cell type, or study design. We introduce BioConflict, a benchmark of 250 expert-annotated paper pairs (500 abstracts) across ten biomedical topics, formalizing three tasks: conflict detection, contextual variable extraction, and consensus synthesis. We evaluate five general-purpose large language models and two domain-specific baselines, finding that general-purpose large language models achieve strong conflict detection (F1 up to 0.89) but exhibit brittle reasoning in synthesis, while domain-specific models lag significantly on all generative tasks. These findings highlight the need for context-aware biomedical AI capable of resolving, not merely retrieving, conflicting scientific evidence.},
}
Markdown (Informal)
[BioConflict: A Benchmark for Evaluating Large Language Models in Biomedical Contradiction Detection and Consensus Synthesis](https://preview.aclanthology.org/ingest-acl-workshops/2026.bionlp-1.44/) (Kirubakaran & Gagnier, BioNLP 2026)
ACL