% AutoChunker (Jain, Aggarwal, Saladi): conference paper in the ACL 2025
% Industry Track proceedings (Volume 6), pages 983--995.
% - title: the camelCase name is brace-protected as a whole word so styles
%   that sentence-case titles keep its capitals.
% - url: canonical ACL Anthology permalink for paper ID 2025.acl-industry.69.
% - address: kept as exported; NOTE(review): this looks like the conference
%   venue rather than the publisher's city -- confirm before moving it.
@inproceedings{jain-etal-2025-autochunker,
  author    = {Jain, Arihant and
               Aggarwal, Purav and
               Saladi, Anoop},
  title     = {{AutoChunker}: Structured Text Chunking and its Evaluation},
  editor    = {Rehm, Georg and
               Li, Yunyao},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  pages     = {983--995},
  isbn      = {979-8-89176-288-6},
  url       = {https://aclanthology.org/2025.acl-industry.69/},
  abstract  = {Text chunking is fundamental to modern retrieval-augmented systems, yet existing methods often struggle with maintaining semantic coherence, both within and across chunks, while dealing with document structure and noise. We present AutoChunker, a bottom-up approach for text chunking that combines document structure awareness with noise elimination. AutoChunker leverages language models to identify and segregate logical units of information (a chunk) while preserving document hierarchy through a tree-based representation. To evaluate the chunking operator, we introduce a comprehensive evaluation framework based on five core tenets: noise reduction, completeness, context coherence, task relevance, and retrieval performance. Experimental results on Support and Wikipedia articles demonstrate that AutoChunker significantly outperforms existing methods, reducing noise while improving chunk completeness compared to state-of-the-art baselines. When integrated with an online product support system, our approach led to improvements in retrieval performance and customer return rates. Our work not only advances the state of text chunking but also provides a standardized framework for evaluating chunking strategies, addressing a critical gap in the field.},
}
Markdown (Informal)
[AutoChunker: Structured Text Chunking and its Evaluation](https://aclanthology.org/2025.acl-industry.69/) (Jain et al., ACL 2025)
ACL
- Arihant Jain, Purav Aggarwal, and Anoop Saladi. 2025. AutoChunker: Structured Text Chunking and its Evaluation. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track), pages 983–995, Vienna, Austria. Association for Computational Linguistics.