@inproceedings{etori-gini-2024-rideke,
  title     = {{RideKE}: Leveraging Low-resource {Twitter} User-generated Content for Sentiment and Emotion Detection on Code-switched {RHS} Dataset.},
  author    = {Etori, Naome and
               Gini, Maria},
  editor    = {De Clercq, Orph{\'e}e and
               Barriere, Valentin and
               Barnes, Jeremy and
               Klinger, Roman and
               Sedoc, Jo{\~a}o and
               Tafreshi, Shabnam},
  booktitle = {Proceedings of the 14th Workshop on Computational Approaches to Subjectivity, Sentiment, {\&} Social Media Analysis},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/add-emnlp-2024-awards/2024.wassa-1.19/},
  doi       = {10.18653/v1/2024.wassa-1.19},
  pages     = {234--249},
  abstract  = {Social media has become a crucial open-access platform enabling individuals to freely express opinions and share experiences. These platforms contain user-generated content facilitating instantaneous communication and feedback. However, leveraging low-resource language data from Twitter can be challenging due to the scarcity and poor quality of content with significant variations in language use, such as slang and code-switching. Automatically identifying tweets in low-resource languages can also be challenging because Twitter primarily supports high-resource languages; low-resource languages often lack robust linguistic and contextual support. This paper analyzes Kenyan code-switched data from Twitter using four transformer-based pretrained models for sentiment and emotion classification tasks using supervised and semi-supervised methods. We detail the methodology behind data collection, the annotation procedure, and the challenges encountered during the data curation phase. Our results show that XLM-R outperforms other models; for sentiment analysis, XLM-R supervised model achieves the highest accuracy (69.2{\%}) and F1 score (66.1{\%}), XLM-R semi-supervised (67.2{\%} accuracy, 64.1{\%} F1 score). In emotion analysis, DistilBERT supervised leads in accuracy (59.8{\%}) and F1 score (31{\%}), mBERT semi-supervised (accuracy 59{\%} and F1 score 26.5{\%}). AfriBERTa models show the lowest accuracy and F1 scores. This indicates that the semi-supervised method's performance is constrained by the small labeled dataset.},
}
Markdown (Informal)
[RideKE: Leveraging Low-resource Twitter User-generated Content for Sentiment and Emotion Detection on Code-switched RHS Dataset.](https://preview.aclanthology.org/add-emnlp-2024-awards/2024.wassa-1.19/) (Etori & Gini, WASSA 2024)
ACL