@inproceedings{galke-scherp-2022-bag,
title = "Bag-of-Words vs. Graph vs. Sequence in Text Classification: Questioning the Necessity of Text-Graphs and the Surprising Strength of a Wide {MLP}",
author = "Galke, Lukas and
Scherp, Ansgar",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.acl-long.279/",
doi = "10.18653/v1/2022.acl-long.279",
pages = "4038--4051",
abstract = "Graph neural networks have triggered a resurgence of graph-based text classification methods, defining today{'}s state of the art. We show that a wide multi-layer perceptron (MLP) using a Bag-of-Words (BoW) outperforms the recent graph-based models TextGCN and HeteGCN in an inductive text classification setting and is comparable with HyperGAT. Moreover, we fine-tune a sequence-based BERT and a lightweight DistilBERT model, which both outperform all state-of-the-art models. These results question the importance of synthetic graphs used in modern text classifiers. In terms of efficiency, DistilBERT is still twice as large as our BoW-based wide MLP, while graph-based models like TextGCN require setting up an $\mathcal{O}(N^2)$ graph, where $N$ is the vocabulary plus corpus size. Finally, since Transformers need to compute $\mathcal{O}(L^2)$ attention weights with sequence length $L$, the MLP models show higher training and inference speeds on datasets with long sequences."
}
Markdown (Informal)
[Bag-of-Words vs. Graph vs. Sequence in Text Classification: Questioning the Necessity of Text-Graphs and the Surprising Strength of a Wide MLP](https://aclanthology.org/2022.acl-long.279/) (Galke & Scherp, ACL 2022)
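For readers who want to try the headline baseline, here is a minimal sketch (not the authors' released code) of a wide one-hidden-layer MLP over Bag-of-Words features, the WideMLP the abstract describes. The dataset (20 Newsgroups, one of the paper's benchmarks), the TF-IDF weighting, the vocabulary cap, and the training budget are illustrative assumptions rather than the exact experimental setup.

```python
# Sketch of a Bag-of-Words wide MLP text classifier; settings are
# illustrative assumptions, not the paper's exact configuration.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

train = fetch_20newsgroups(subset="train")
test = fetch_20newsgroups(subset="test")

# Bag-of-Words features: no token order, no synthetic text graph.
vectorizer = TfidfVectorizer(max_features=30_000)
X_train = vectorizer.fit_transform(train.data)
X_test = vectorizer.transform(test.data)

# A single wide hidden layer of ReLU units, in the spirit of the
# paper's WideMLP (the width of 1024 is our assumption here).
mlp = MLPClassifier(hidden_layer_sizes=(1024,), activation="relu",
                    max_iter=50, random_state=0)
mlp.fit(X_train, train.target)

print("test accuracy:", accuracy_score(test.target, mlp.predict(X_test)))
```

Note that, unlike TextGCN-style models, nothing here builds a word-document graph: the cost is a single sparse feature matrix plus one hidden layer, which is the efficiency point the abstract makes.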