@inproceedings{kodali-manukonda-2024-bytesizedllm,
title = "byte{S}ized{LLM}@{D}ravidian{L}ang{T}ech 2024: Fake News Detection in {D}ravidian Languages - Unleashing the Power of Custom Subword Tokenization with {S}ubword2{V}ec and {B}i{LSTM}",
author = "Kodali, Rohith Gowtham and
Manukonda, Durga Prasad",
editor = "Chakravarthi, Bharathi Raja and
Priyadharshini, Ruba and
Madasamy, Anand Kumar and
Thavareesan, Sajeetha and
Sherly, Elizabeth and
Nadarajan, Rajeswari and
Ravikiran, Manikandan",
booktitle = "Proceedings of the Fourth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages",
month = mar,
year = "2024",
address = "St. Julian's, Malta",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.dravidianlangtech-1.12/",
pages = "79--84",
abstract = "This paper focuses on detecting fake news in resource-constrained languages, particularly Malayalam. We present a novel framework combining subword tokenization, Sanskrit-transliterated Subword2vec embeddings, and a powerful Bidirectional Long Short-Term Memory (BiLSTM) architecture. Despite using only monolingual Malayalam data, our model excelled in the FakeDetect-Malayalam challenge, ranking 4th. The innovative subword tokenizer achieves a remarkable 200x compression ratio, highlighting its efficiency in minimizing model size without compromising accuracy. Our work facilitates resource-efficient deployment in diverse linguistic landscapes and sparks discussion on the potential of multilingual data augmentation. This research provides a promising avenue for mitigating linguistic challenges in the NLP-driven battle against deceptive content."
}
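As a rough illustration of the architecture described in the abstract (custom subword tokenization feeding Subword2Vec-style embeddings into a BiLSTM classifier), the following is a minimal PyTorch sketch. It is not the authors' code: the vocabulary size, embedding and hidden dimensions, and the two-class output are assumptions chosen for illustration, and the embedding layer is trained from scratch here rather than initialized from Subword2Vec vectors as in the paper.

```python
# Minimal illustrative sketch (not the authors' implementation): a BiLSTM
# classifier over subword-token IDs, following the pipeline sketched in the
# abstract (subword tokenization -> embeddings -> BiLSTM -> class scores).
# All dimensions and the number of classes below are assumed values.
import torch
import torch.nn as nn


class BiLSTMFakeNewsClassifier(nn.Module):
    def __init__(self, vocab_size=8000, embed_dim=128, hidden_dim=128, num_classes=2):
        super().__init__()
        # In the paper, this layer would be initialized from pretrained
        # Subword2Vec embeddings rather than learned from scratch.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim,
                              batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(2 * hidden_dim, num_classes)

    def forward(self, token_ids):
        # token_ids: (batch, seq_len) integer subword IDs
        embedded = self.embedding(token_ids)
        _, (hidden, _) = self.bilstm(embedded)
        # Concatenate the final forward and backward hidden states.
        final = torch.cat([hidden[-2], hidden[-1]], dim=-1)
        return self.classifier(final)


# Example usage with dummy subword IDs (4 documents, 64 tokens each).
model = BiLSTMFakeNewsClassifier()
dummy_batch = torch.randint(1, 8000, (4, 64))
logits = model(dummy_batch)  # shape: (4, 2)
```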
Markdown (Informal)
[byteSizedLLM@DravidianLangTech 2024: Fake News Detection in Dravidian Languages - Unleashing the Power of Custom Subword Tokenization with Subword2Vec and BiLSTM](https://preview.aclanthology.org/add-emnlp-2024-awards/2024.dravidianlangtech-1.12/) (Kodali & Manukonda, DravidianLangTech 2024)
ACL
Rohith Gowtham Kodali and Durga Prasad Manukonda. 2024. [byteSizedLLM@DravidianLangTech 2024: Fake News Detection in Dravidian Languages - Unleashing the Power of Custom Subword Tokenization with Subword2Vec and BiLSTM](https://preview.aclanthology.org/add-emnlp-2024-awards/2024.dravidianlangtech-1.12/). In *Proceedings of the Fourth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages*, pages 79–84, St. Julian's, Malta. Association for Computational Linguistics.