% Workshop paper record (C3NLP 2026) in ACL Anthology export style.
% NOTE(review): the url field points at a preview.aclanthology.org ingest
% branch; presumably it should become the canonical aclanthology.org URL
% once the volume is published -- confirm before release.
@inproceedings{nair-etal-2026-sorry,
    title = "``Sorry, Can{'}t Help You'': How Large Language Models Judge Failures to Help Across Languages",
    author = "Nair, Pavithra P M and
      Gressel, Gilad and
      Achuthan, Krishnashree",
    editor = "Prabhakaran, Vinodkumar and
      Dev, Sunipa and
      Benotti, Luciana and
      Hershcovich, Daniel and
      Cao, Yong and
      Zhou, Li and
      Ma, Bolei and
      Adebara, Ife",
    booktitle = "Proceedings of the 4th Workshop on Cross-Cultural Considerations in {NLP} ({C}3{NLP} 2026)",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.c3nlp-1.13/",
    pages = "161--176",
    ISBN = "979-8-89176-420-0",
    abstract = "Cross-cultural psychology has shown that moral judgments about failures to help vary systematically across cultures. In a landmark study, Miller, Bersoff, and Harwood (1990) found that while Indian and American participants agreed that failures to help are undesirable, they differed in whether they considered helping a moral obligation subject to social sanction or a personal decision. We adapt Miller et al.{'}s paradigm{---}nine scenarios crossing need severity (life-threatening, moderate, minor) with role relationship (parent, friend, stranger) and their original probe questions{---}to a cross-lingual LLM setting, presenting them to four LLMs (GPT-5.4, Claude-Opus-4.6, DeepSeek-V3.1, Qwen3-235B) across ten languages. We find that language significantly shapes how LLMs categorize failures to help as moral violations, social conventions, personal-moral concerns, or personal decisions ($\chi^2(27) = 116.14$, $p < .001$, Cramer{'}s $V = 0.147$). Models agree across languages that failures to help are undesirable, but diverge substantially in how they classify them, with the primary divergence falling between moral violations and personal decisions. The proportion of responses classifying failures as moral violations decreases as need severity decreases and the role relationship becomes more distant. Cross-lingual variation differs substantially across models, with open-weight models showing significantly stronger variation than closed-weight models. These findings indicate that users consulting LLMs in different languages may receive substantively different moral guidance, underscoring the need for cross-lingual normative auditing as a component of multilingual LLM evaluation."
}