% Conference paper in the CoNLL 2026 proceedings (pages 70--89).
% NOTE(review): the url field below targets the ACL Anthology preview host
% (ingest-acl-workshops branch) -- presumably a staging link; confirm and
% replace with the canonical Anthology URL once it is available.
@inproceedings{wang-cai-2026-similar,
  author    = {Wang, Shuqi and Cai, Zhenguang},
  editor    = {Bonial, Claire and Berzak, Yevgeni},
  title     = {Similar Predictions, Different Processes: A Multi-Level Comparison of Human and Multimodal {LLM} Language Prediction},
  booktitle = {Proceedings of the 30th Conference on Computational Natural Language Learning},
  year      = {2026},
  month     = jul,
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {70--89},
  isbn      = {979-8-89176-410-1},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.6/},
  abstract  = {Humans and large language models (LLMs) both generate predictions during language processing, but whether they integrate structural and prosodic cues similarly during visually grounded speech remains underexplored. Multimodal LLMs that jointly process speech and vision now make it possible to compare not only what humans and models predict, but also when predictions emerge. We compared Mandarin speakers and Qwen2.5-Omni-7B on Mandarin dative constructions in a visual world paradigm (VWP), asking how these cues guide predictions about upcoming referents. Experiment 1 used a cloze-in-VWP task to assess offline prediction outputs; Experiment 2 examined online processing via human eye-tracking and a model audio-to-image cross-modal attention measure. In Experiment 1, humans and the model were both sensitive to structure and prosody, consistent with partial output-level alignment, but the model showed a larger structural effect and a condition-specific atypical prosody pattern. In Experiment 2, the time courses diverged: humans showed structural effects before the contrastive connective, whereas the model{'}s sensitivity emerged later, after connective onset. These findings indicate that output-level and process-level alignment can dissociate in this paradigm. This study contributes a methodology for multi-level human{--}model comparison and provides empirical constraints on claims about the cognitive plausibility of multimodal LLMs.},
}
Markdown (Informal)
[Similar Predictions, Different Processes: A Multi-Level Comparison of Human and Multimodal LLM Language Prediction](https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.6/) (Wang & Cai, CoNLL 2026)
ACL