@inproceedings{kishi-etal-2026-fake,
    title     = {Fake News Detection Strategies under Dataset Bias: Using Large-scale Coarse-grained Labels},
    author    = {Kishi, Yuki and
                 Arima, Yuji and
                 Iyatomi, Hitoshi},
    editor    = {Baez Santamaria, Selene and
                 Somayajula, Sai Ashish and
                 Yamaguchi, Atsuki},
    booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 4: Student Research Workshop)},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.47/},
    pages     = {612--621},
    isbn      = {979-8-89176-383-8},
    abstract  = {The spread of misinformation has prompted extensive research on machine-learning{--}based fake news detection. However, existing datasets differ substantially in content distributions and annotation policies, complicating fair evaluation and generalization assessment. We refer to these structural differences as dataset bias. In this study, we quantitatively analyze dataset bias across multiple public fake news datasets (Kaggle, FNN, ISOT, and NELA-GT-2019/2020) with different annotation granularities, including article-level and publisher-level labels. Using document embedding{--}based similarity analysis and article category distributions, we examine how such biases affect detection performance under in-dataset and cross-dataset evaluation settings. Furthermore, to leverage large-scale but coarse-grained publisher-level data, we compare proxy-label training with a semi-supervised learning approach based on Virtual Adversarial Training (VAT). Our results show that detection performance strongly depends on dataset-specific biases, and that proxy-label training and SSL exhibit complementary, and sometimes opposite, strengths depending on whether the evaluation emphasizes in-dataset performance or cross-dataset generalization. These findings highlight the importance of appropriate training strategies and evaluation protocols when using heterogeneous fake news datasets.},
}

@comment{Stray text pasted from the ACL Anthology page export, quarantined here
  (was fused onto the entry's closing brace; BibTeX ignores it but it should
  not sit loose between entries):
  Markdown (Informal)
  [Fake News Detection Strategies under Dataset Bias: Using Large-scale Coarse-grained Labels](https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.47/) (Kishi et al., EACL 2026)
  ACL
  NOTE(review): the url field is a temporary "preview/ingest" address; replace
  with the canonical aclanthology.org URL once the paper is fully published.}