@inproceedings{tuteja-gonzalez-jucla-2023-long,
  title     = "Long Text Classification using {Transformers} with Paragraph Selection Strategies",
  author    = "Tuteja, Mohit and
               Gonz{\'a}lez Jucl{\`a}, Daniel",
  editor    = "Preoțiuc-Pietro, Daniel and
               Goanta, Catalina and
               Chalkidis, Ilias and
               Barrett, Leslie and
               Spanakis, Gerasimos and
               Aletras, Nikolaos",
  booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2023",
  month     = dec,
  year      = "2023",
  address   = "Singapore",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2023.nllp-1.3/",
  doi       = "10.18653/v1/2023.nllp-1.3",
  pages     = "17--24",
  abstract  = "In the legal domain, we often perform classification tasks on very long documents, for example court judgements. These documents often contain thousands of words, so the length of these documents poses a challenge for this modelling task. In this research paper, we present a comprehensive evaluation of various strategies to perform long text classification using Transformers in conjunction with strategies to select document chunks using traditional NLP models. We conduct our experiments on 6 benchmark datasets comprising lengthy documents, 4 of which are publicly available. Each dataset has a median word count exceeding 1,000. Our evaluation encompasses state-of-the-art Transformer models, such as RoBERTa, Longformer, HAT, MEGA and LegalBERT and compares them with a traditional baseline TF-IDF + Neural Network (NN) model. We investigate the effectiveness of pre-training on large corpora, fine tuning strategies, and transfer learning techniques in the context of long text classification."
}
Markdown (Informal)
[Long Text Classification using Transformers with Paragraph Selection Strategies](https://aclanthology.org/2023.nllp-1.3/) (Tuteja & González Juclà, NLLP 2023)
ACL