% Conference paper in the RANLP 2025 proceedings (see booktitle).
% NOTE(review): the url field points at an ACL Anthology *preview* build
%   (path segment "corrections-2026-01"); confirm the canonical, stable URL
%   (presumably https://aclanthology.org/2025.ranlp-1.46/) before citing.
% NOTE(review): address looks like the conference venue, while publisher
%   also embeds a city ("Shoumen, Bulgaria") -- confirm which is intended.
% Title: proper nouns and the show name are braced as whole units so that
%   sentence-casing bibliography styles cannot lowercase them.
@inproceedings{ganea-etal-2025-culturally,
  title     = {A Culturally-Rich {Romanian} {NLP} Dataset from {``Who Wants to Be a Millionaire?''} Videos},
  author    = {Ganea, Alexandru and
               Popovici, Antonia-Adelina and
               Dumitran, Marius},
  editor    = {Angelova, Galia and
               Kunilovskaya, Maria and
               Escribe, Marie and
               Mitkov, Ruslan},
  booktitle = {Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era},
  month     = sep,
  year      = {2025},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd., Shoumen, Bulgaria},
  url       = {https://preview.aclanthology.org/corrections-2026-01/2025.ranlp-1.46/},
  pages     = {378--387},
  abstract  = {Large Language Models (LLMs) demonstrate varying performance across languages and cultural contexts. This study introduces a novel, culturally-rich, multilingual dataset derived from video recordings of the Romanian game show ``Who Wants to Be a Millionaire?'' (Vrei s{\u{a}} fii Milionar?). We employed an innovative process combining optical character recognition (OCR), automated text extraction, and manual verification to collect question-answer pairs, enriching them with metadata including question domain (e.g., biology, history), cultural relevance (Romanian-specific vs. international), and difficulty. Benchmarking state-of-the-art LLMs, including Romanian-adapted models, on this dataset revealed significant performance disparities: models consistently achieve higher accuracy (80-95{\%}) on international questions compared to Romanian-specific cultural questions (50-75{\%}). We further investigate these differences through experiments involving machine translation of Romanian questions into English and cross-lingual tests using a comparable dataset in French. Our findings underscore the impact of cultural context and data source on LLM performance and offer practical insights for building robust, culturally-aware multilingual NLP systems, especially in educational domains. The dataset is publicly available.},
}
Markdown (Informal)
[A Culturally-Rich Romanian NLP Dataset from “Who Wants to Be a Millionaire?” Videos](https://preview.aclanthology.org/corrections-2026-01/2025.ranlp-1.46/) (Ganea et al., RANLP 2025)
ACL