@comment{
  Conference paper (long paper 133) in the IJCNLP-AACL 2025 proceedings.
  The abstract is stored as plain LaTeX-safe text (no Markdown markup).
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  path; confirm whether a permanent URL should replace it.
}
@inproceedings{pal-etal-2025-indicclaimbuster,
  title     = {{IndicClaimBuster}: A Multilingual Claim Verification Dataset},
  author    = {Pal, Pritam and
               Jana, Shyamal Krishna and
               Das, Dipankar},
  editor    = {Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.133/},
  pages     = {2478--2489},
  isbn      = {979-8-89176-298-5},
  abstract  = {The present article introduces IndicClaimBuster, a novel multilingual claim verification dataset comprising $\approx$ 9K claims and their corresponding evidence in English, Hindi, Bengali, and Hindi-English CodeMixed texts. The data set covers three key domains: politics, law and order, and health, to address the challenges of verifiable facts. Each claim was sourced from reputable Indian news portals and is accompanied by three pieces of evidence, two LLM-generated and one manually curated. Additionally, a separate attempt was conducted to generate refuted claims by employing an LLM. We further develop two frameworks: an unsupervised baseline and a two-stage pipeline that comprises evidence retrieval and veracity prediction modules. For retrieval, we fine-tuned SBERT models, with e5-base demonstrating superior average performance across languages, whereas for veracity prediction, multilingual transformers (mBERT, XLM-R, MuRIL, IndicBERTv2) were fine-tuned. Results indicate MuRIL and IndicBERTv2 excel in Indian languages, while XLM-R performs the best for CodeMix. Our work contributes a high-quality multilingual dataset and strong baseline methodologies, offering valuable resources for advancing automated claim verification in linguistically diverse and low-resource settings for Indian languages. The IndicClaimBuster dataset is available at: https://github.com/pritampal98/indic-claim-buster},
}
Markdown (Informal)
[IndicClaimBuster: A Multilingual Claim Verification Dataset](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.133/) (Pal et al., IJCNLP-AACL 2025)
ACL
- Pritam Pal, Shyamal Krishna Jana, and Dipankar Das. 2025. IndicClaimBuster: A Multilingual Claim Verification Dataset. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 2478–2489, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.