% Conference paper from the PROPOR 2026 proceedings (Vol. 1), pp. 621--627.
% Names keep the Portuguese convention of placing the particle after the
% given name ("Medeiros, Ivo de"), so entries sort under the main surname.
% NOTE(review): the url field points at a preview.aclanthology.org ingest
% host; confirm the canonical address once the volume is published.
@inproceedings{campos-etal-2026-nlp,
  title     = {{NLP}-based Page Classification for Efficient {LLM} Extraction from {Brazilian} Public Tender Documents},
  author    = {Campos, Pedro and
               Medeiros, Ivo de and
               Ara{\'u}jo, Adailton de},
  editor    = {Souza, Marlo and
               de-Dios-Flores, Iria and
               Santos, Diana and
               Freitas, Larissa and
               Souza, Jackson Wilke da Cruz and
               Ribeiro, Eug{\'e}nio},
  booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 1},
  month     = apr,
  year      = {2026},
  address   = {Salvador, Brazil},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-dnd/2026.propor-1.61/},
  pages     = {621--627},
  isbn      = {979-8-89176-387-6},
  abstract  = {Extracting structured information from lengthy documents using Large Language Models (LLMs) is computationally expensive and prone to accuracy degradation as input size increases. We present a two-stage pipeline for extracting products from Brazilian tender documents (editais de licita{\c{c}}{\~a}o), combining NLP-based page classification with LLM extraction. We construct a novel dataset of 11,190 annotated pages from 350 documents across five product domains. Our experiments compare transformer-based classifiers (BERTimbau, DistilBERT) with classical machine learning approaches using engineered features. Results show that XGBoost with domain-specific features achieves 97.75{\%} F1-score, outperforming fine-tuned BERT models by over 4 percentage points. The complete pipeline reduces LLM input tokens by 64--88{\%} while maintaining extraction completeness, enabling cost-effective document processing at scale.},
}
Markdown (Informal)
[NLP-based Page Classification for Efficient LLM Extraction from Brazilian Public Tender Documents](https://preview.aclanthology.org/ingest-dnd/2026.propor-1.61/) (Campos et al., PROPOR 2026)
ACL