% NAACL 2025 Industry Track paper, ACL Anthology ID 2025.naacl-industry.73.
% The url field uses the canonical aclanthology.org address rather than a
% preview-branch host, which is not a stable citation target.
% NOTE(review): address appears to hold the conference location (ACL Anthology
% export convention), not the publisher's city; confirm before changing.
@inproceedings{tumre-etal-2025-improved,
  author    = {Tumre, Siddharth and
               Patil, Sangameshwar and
               Kumar, Alok},
  editor    = {Chen, Weizhu and
               Yang, Yi and
               Kachuee, Mohammad and
               Fu, Xue-Yong},
  title     = {Improved Near-Duplicate Detection for Aggregated and Paywalled News-Feeds},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  pages     = {979--987},
  isbn      = {979-8-89176-194-0},
  url       = {https://aclanthology.org/2025.naacl-industry.73/},
  abstract  = {News aggregators play a key role in the rapidly evolving digital landscape by providing comprehensive and timely news stories aggregated from diverse sources into one feed. As these articles are sourced from different outlets, they often end up covering the same underlying event but differ in phrasing, formatting or supplemented with additional details. It is crucial for the news aggregators to identify these near-duplicates, improving the content quality and user engagement by steering away from redundant information. The problem of near-duplicate news detection has become harder with increasing use of paywalls by the news websites resulting in restricted access to the content. It is now common to get only the headline and a short snippet from the article. Previous works have concentrated on full length versions of documents such as webpages. There is very little work that focuses on this variation of the near-duplicate detection problem in which only headline and a small text blurb is available for each news article. We propose Near-Duplicate Detection Using Metadata Augmented Communities (NDD-MAC) approach that combines embeddings from pretrained language model (PLM) and latent metadata of a news article followed by community detection to identify clusters of near-duplicates. We show the efficacy of proposed approach using 2 different real-world datasets. By integrating metadata with community detection, NDD-MAC is able to detect nuanced similarities and differences in news snippets and offers an industrial scale solution for the near-duplicate detection in scenarios with restricted content availability.},
}
Markdown (Informal)
[Improved Near-Duplicate Detection for Aggregated and Paywalled News-Feeds](https://aclanthology.org/2025.naacl-industry.73/) (Tumre et al., NAACL 2025)
ACL
- Siddharth Tumre, Sangameshwar Patil, and Alok Kumar. 2025. Improved Near-Duplicate Detection for Aggregated and Paywalled News-Feeds. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: Industry Track), pages 979–987, Albuquerque, New Mexico. Association for Computational Linguistics.