@comment{ACL Anthology record for the Trafilatura system demonstration paper (ACL-IJCNLP 2021). The url field holds the canonical Anthology address rather than a temporary preview-build link.}
@inproceedings{barbaresi-2021-trafilatura,
  title     = {Trafilatura: {A} Web Scraping Library and Command-Line Tool for Text Discovery and Extraction},
  author    = {Barbaresi, Adrien},
  editor    = {Ji, Heng and Park, Jong C. and Xia, Rui},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations},
  month     = aug,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.acl-demo.15/},
  doi       = {10.18653/v1/2021.acl-demo.15},
  pages     = {122--131},
  abstract  = {An essential operation in web corpus construction consists in retaining the desired content while discarding the rest. Another challenge finding one's way through websites. This article introduces a text discovery and extraction tool published under open-source license. Its installation and use is straightforward, notably from Python and on the command-line. The software allows for main text, comments and metadata extraction, while also providing building blocks for web crawling tasks. A comparative evaluation on real-world data also shows its interest as well as the performance of other available solutions. The contributions of this paper are threefold: it references the software, features a benchmark, and provides a meaningful baseline for similar tasks. The tool performs significantly better than other open-source solutions in this evaluation and in external benchmarks.},
}
Markdown (Informal)
[Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction](https://aclanthology.org/2021.acl-demo.15/) (Barbaresi, ACL-IJCNLP 2021)
ACL