@comment{
  CultureInstruct paper (Pham, Li, Qu, Haffari), NAACL 2025, Volume 1: Long Papers.
  The url field uses the canonical aclanthology.org address; the earlier
  preview.aclanthology.org/fix-sig-urls address was a temporary staging host.
  The dataset name in the title is brace-protected as a whole word so that
  sentence-casing styles keep its capitalisation.
  NOTE(review): address holds the conference location, which appears to follow
  the ACL Anthology export convention -- confirm before changing it to the
  publisher city.
}
@inproceedings{pham-etal-2025-cultureinstruct,
  author    = {Pham, Viet Thanh and
               Li, Zhuang and
               Qu, Lizhen and
               Haffari, Gholamreza},
  title     = {{CultureInstruct}: Curating Multi-Cultural Instructions at Scale},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  pages     = {9207--9228},
  isbn      = {979-8-89176-189-6},
  url       = {https://aclanthology.org/2025.naacl-long.465/},
  abstract  = {Large language models, despite their remarkable success in recent years, still exhibit severe cultural bias. Therefore, in this paper, we introduce CultureInstruct, a large-scale instruction-tuning dataset designed to reduce cultural bias in LLMs. CultureInstruct is constructed with an automatic pipeline, utilizing public web sources and a specialized LLM to generate instruction. Our data comprises 430K instructions, ranging from classic NLP tasks to complex reasoning. CultureInstruct also covers 11 most relevant topics to cultural knowledge, making it highly diverse. Our experiments show that fine-tuning LLMs with CultureInstruct results in consistent improvements across three types of cultural benchmarks, including (i) general cultural knowledge, (ii) human opinions and values, and (iii) linguistic cultural bias. Our best model, Qwen2-Instruct 72B + CultureInstruct, outperforms GPT-4o Mini and GPT-4o with 18.47{\%} and 13.07{\%} average relative improvements on cultural benchmarks.},
}
Markdown (Informal)
[CultureInstruct: Curating Multi-Cultural Instructions at Scale](https://aclanthology.org/2025.naacl-long.465/) (Pham et al., NAACL 2025)
ACL
- Viet Thanh Pham, Zhuang Li, Lizhen Qu, and Gholamreza Haffari. 2025. CultureInstruct: Curating Multi-Cultural Instructions at Scale. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 9207–9228, Albuquerque, New Mexico. Association for Computational Linguistics.