% Kapil and Ekbal, ICON 2024 (pp. 79--85). The url field uses the canonical
% ACL Anthology address for record 2024.icon-1.9 instead of a preview-branch
% mirror; proper nouns in the title are brace-protected as whole words.
@inproceedings{kapil-ekbal-2024-corpus,
  author    = {Kapil, Prashant and Ekbal, Asif},
  title     = {A Corpus of {Hindi}-{English} Code-Mixed Posts to Hate Speech Detection},
  editor    = {Lalitha Devi, Sobha and Arora, Karunesh},
  booktitle = {Proceedings of the 21st International Conference on Natural Language Processing (ICON)},
  month     = dec,
  year      = {2024},
  address   = {AU-KBC Research Centre, Chennai, India},
  publisher = {NLP Association of India (NLPAI)},
  pages     = {79--85},
  url       = {https://aclanthology.org/2024.icon-1.9/},
  abstract  = {Social media content, such as blog posts, comments, and tweets, often contains offensive language, including racial hate speech, personal attacks, and sexual harassment. Detecting inappropriate language is crucial for user safety and prevention of hateful behavior and aggression. This study introduces the HECM (Hindi-English code-mixed tweets) to fill the gap in Hindi language resources. The corpus comprises approximately 9.4K tweets labeled as hateful and nonhateful. It includes detailed information on the data, such as the annotation schema, the label definitions, and an interannotator agreement score of 85{\%}. The study evaluates the effectiveness of traditional machine learning, deep neural networks, and transformer encoder-based approaches. The results show a significant improvement in terms of macro-F1 and weighted F1 scores. Additionally, a lexicon containing 2000 lexicons tagged in 21 categories is created based on the multilingual HURTLEX lexicon. This lexicon is merged with the transformer encoder, resulting in a marginal improvement in macro-F1 and weighted-F1. The study also experiments with a Hindi-Devanagari dataset to assess the impact of the lexicon on performance metrics.},
}
Markdown (Informal)
[A Corpus of Hindi-English Code-Mixed Posts to Hate Speech Detection](https://aclanthology.org/2024.icon-1.9/) (Kapil & Ekbal, ICON 2024)
ACL