% Workshop paper from the BLP-2025 proceedings (ACL Anthology ID 2025.banglalp-1.25).
% NOTE(review): the last editor is recorded as "Rahman, Md Tahmid Rahman", which
%   repeats the surname inside the given names -- verify against the proceedings
%   front matter before relying on it.
% NOTE(review): `address` appears to carry the event location rather than the
%   publisher's city -- confirm this is what the target style expects.
% The `url` uses the permanent Anthology address for this ID instead of the
%   temporary "preview.../ingest-ijcnlp-aacl" staging link.
@inproceedings{rahman-2025-clustering,
  author    = {Rahman, Rifat},
  title     = {Clustering {LLM}-based Word Embeddings to Determine Topics from {Bangla} Articles},
  editor    = {Alam, Firoj and
               Kar, Sudipta and
               Chowdhury, Shammur Absar and
               Hassan, Naeemul and
               Prince, Enamul Hoque and
               Tasnim, Mohiuddin and
               Rony, Md Rashad Al Hasan and
               Rahman, Md Tahmid Rahman},
  booktitle = {Proceedings of the Second Workshop on {Bangla} Language Processing ({BLP-2025})},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {Association for Computational Linguistics},
  pages     = {309--321},
  isbn      = {979-8-89176-314-2},
  url       = {https://aclanthology.org/2025.banglalp-1.25/},
  abstract  = {Topic modeling methods identify fundamental themes within textual documents, facilitating an understanding of the insights inside them. Traditional topic modeling approaches are based on the generative probabilistic process that assumes the document-topic and topic-word distribution. Hence, those approaches fail to capture semantic similarities among words inside the documents and are less scalable with the vast number of topics and documents. This paper presents a method for capturing topics from Bangla documents by clustering the word vectors induced from LLM models. Corpus statistics are integrated into the clustering \& word reordering process within each cluster or topic to extract the top words. Additionally, we deploy dimensionality reduction techniques, such as PCA, prior to clustering. Finally, we perform a comparative study and identify the best-performing combination of clustering and word embedding methods. Our top-performing combination outperforms the traditional probabilistic topic model in capturing topics and top words per topic, and excels notably in terms of computational efficiency and time complexity.},
}
Markdown (Informal)
[Clustering LLM-based Word Embeddings to Determine Topics from Bangla Articles](https://aclanthology.org/2025.banglalp-1.25/) (Rahman, BanglaLP 2025)
ACL