@inproceedings{aguiar-etal-2026-assessing,
  title     = "Assessing the Difficulty of Inference Types in Natural Language Inference for Clinical Trials",
  author    = "Aguiar, Mathilde and
               Zweigenbaum, Pierre and
               Naderi, Nona",
  editor    = "Piperidis, Stelios and
               Bel, N{\'u}ria and
               van den Heuvel, Henk and
               Ide, Nancy and
               Krek, Simon and
               Toral, Antonio",
  booktitle = "Proceedings of the International Conference on Language Resources and Evaluation ({LREC} 2026)",
  month     = may,
  year      = "2026",
  address   = "Palma de Mallorca, Spain",
  publisher = "ELRA Language Resources Association",
  url       = "https://aclanthology.org/2026.lrec-main.413/",
  pages     = "5290--5300",
  abstract  = "Large Language Models (LLMs) achieve competitive results on Natural Language Inference when applied to clinical trials; however, it is not yet clear which type of inference LLMs perform well or poorly on. We address this by proposing new supplementary annotations for the existing NLI4CT dataset on the types of inferences observed in clinical trials. Our dataset supplements NLI4CT with a total of 1,949 new annotations using our carefully crafted guidelines for 17 types of inferences. To investigate how inference types affect the performance of LLMs, we prompt Flan-T5, Llama, Mistral, and Qwen and evaluate their performance using our newly annotated dataset. We found that logical inferences negatively affect the overall performance of Qwen3-4B, Qwen2.5-7B, and Qwen2.5-14B, whereas numerical inferences negatively affect the performance of Flan-T5-XL and Mixtral. Further analysis shows that MMed-Llama-3 struggles to understand the structure of clinical trial reports. Other parameters, such as the number of inference types involved or the section type in the premise, also influence the performance of the models. Our code and dataset are publicly available.",
}
[Assessing the Difficulty of Inference Types in Natural Language Inference for Clinical Trials](https://aclanthology.org/2026.lrec-main.413/) (Aguiar et al., LREC 2026)
ACL