% Workshop paper entry; braces in title/booktitle protect names from style recasing.
@inproceedings{agrawal-mamidi-2026-bigger,
  title     = {Does Bigger Mean Funnier? Evaluating Humor Generation Across the {Qwen3} Model Family},
  author    = {Agrawal, Jatin and
               Mamidi, Radhika},
  editor    = {Amir, Ori and
               Hempelmann, Christian F. and
               Rayz, Julia and
               Dong, Tiansi and
               Miller, Tristan},
  booktitle = {Proceedings of the 2nd Workshop on Computational Humor ({CHum} 2026)},
  month     = jul,
  year      = {2026},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.chum-1.7/},
  pages     = {81--94},
  isbn      = {979-8-89176-431-6},
  abstract  = {We investigate whether scaling model parameters improves humor generation through a controlled ablation study. Using five Qwen3 variants (8B{--}235B, dense and MoE), we generate jokes across 50 themes. Beyond evaluating humor scaling, this work serves as an empirical study into the nature of LLM versus human evaluations on highly subjective creative tasks. While an automated judge yields a perfect monotonic ranking between parameter count and win rate, human annotators find no significant aggregate difference in humor quality. Restricting to themes where annotators agree reveals a significant preference for the largest model (p = 0.039), suggesting scaling effects exist but are masked by a ``quality floor.'' Crucially, our analysis of bias characteristics shows that the automated judge exhibits severe positional and length biases compared to human evaluators, further suggesting that LLMs may systematically distort quality differences on subjective tasks.},
}
Markdown (Informal)
[Does Bigger Mean Funnier? Evaluating Humor Generation Across the Qwen3 Model Family](https://preview.aclanthology.org/ingest-acl-workshops/2026.chum-1.7/) (Agrawal & Mamidi, chum 2026)
ACL