% System-description paper by Song, Yeom and Kim for SemEval-2026 Task 7,
% published in the proceedings of the 20th International Workshop on
% Semantic Evaluation.
% NOTE(review): the url field points at a preview.aclanthology.org staging
% branch ("ingest-acl-workshops"); confirm and switch to the permanent URL
% (and add a DOI, if one is assigned) once the volume is published.
% NOTE(review): address appears to hold the event location rather than the
% publisher's city; left unchanged, confirm against the target style.
@inproceedings{song-etal-2026-k,
  title     = {{K-NLPers} at {SemEval}-2026 Task 7: Multiple {LLM} Agent Debate System for Everyday Knowledge Across Diverse Languages and Cultures},
  author    = {Song, Jiwoo and
               Yeom, Sihyeong and
               Kim, Harksoo},
  editor    = {Kochmar, Ekaterina and
               Ghosh, Debanjan and
               North, Kai and
               Komachi, Mamoru},
  booktitle = {Proceedings of the 20th {International} {Workshop} on {Semantic} {Evaluation} (2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.semeval-1.410/},
  pages     = {3280--3296},
  isbn      = {979-8-89176-414-9},
  abstract  = {This paper presents the K-NLPers system for SemEval-2026 Task 7: Everyday Knowledge Across Diverse Languages and Cultures. The task extends the BLEnD benchmark to evaluate cultural understanding of language models across more than 30 language-country pairs. Although Large Language Models (LLMs) achieve strong overall performance, they exhibit performance disparities across cultural contexts and tend to produce regionally biased responses. To address this limitation, we propose a continent-based multi-agent debate framework that leverages culture-specific performance differences instead of relying on a single model. For the Short Answer Question (SAQ) track, we employ three agents: a general-purpose model, a continent-specific model, and a country-level or culturally adjacent model. These agents engage in independent generation, mutual refinement, and final adjudication. For the Multiple-Choice Question (MCQ) track, we adopt a debate structure centered on high-performing general-purpose models due to the track{'}s simpler structure. Our system participated in all language-region pairs and achieved overall scores of 55.75 on SAQ and 88.32 on MCQ. Further analysis reveals that grouping the performance of various individual models by continent explains performance patterns more consistently than language-based grouping, highlighting the importance of cultural and historical context in model generalization.},
}
Markdown (Informal)
[K-NLPers at SemEval-2026 Task 7: Multiple LLM Agent Debate System for Everyday Knowledge Across Diverse Languages and Cultures](https://preview.aclanthology.org/ingest-acl-workshops/2026.semeval-1.410/) (Song et al., SemEval 2026)
ACL