\documentclass[11pt]{article}

% Change "review" to "final" to generate the final (camera-ready) version.
\usepackage[final]{acl}

% Standard package includes
\usepackage{times}
\usepackage{latexsym}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{microtype}
\usepackage{inconsolata}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{algpseudocode}

\title{Team HausaNLP at SemEval-2026 Task 9: Tackling Class Imbalance in Low-Resource Hausa Polarization Detection}

\author{Faisal Muhammad Adam \\
  ACETEL, National Open University of Nigeria \\
  \texttt{faisaladamm@gmail.com} \\\And
  Lukman Aliyu Jubrin\\
  HausaNLP \\
  \texttt{lukman.j.aliyu@gmail.com} \\\AND
  Sani Aji \\
  Department of Mathematics, Faculty of Science,\\
  Gombe State University, Gombe, Nigeria\\
  \texttt{ajysani@yahoo.com} \\\And
  Abdulhamid Abubakar\\
  Nasarawa State University, Keffi\\
  \texttt{abdulhamid@ab-bkr.com} \\}

\begin{document}
\maketitle

\begin{abstract}
This paper describes our submission to SemEval-2026 Task 9, Subtask 2 (Hausa). The task involves identifying specific categories of polarization (Political, Religious, Ethnic, etc.) in Hausa social media comments. The dataset presented significant challenges, primarily extreme class imbalance and the low-resource nature of the language. Our system uses a pre-trained multilingual transformer (Afro-XLMR-Large) fine-tuned with Weighted Binary Cross-Entropy loss and dynamic undersampling (1:3 polarized-to-non-polarized ratio) to mitigate the scarcity of polarized examples. On the official test set, our system achieved a Macro-F1 score of 0.2346 and a Micro-F1 score of 0.2581. Our model is recall-oriented (Micro-Recall: 0.6166), demonstrating strong capability in detecting polarization, though precision remains a challenge (Micro-Precision: 0.1632). We achieved our best per-class performance in the Political domain (F1: 0.48).
\end{abstract}

\section{Introduction}
Social media polarization is a growing concern, particularly in the Global South where automated moderation tools are scarce. Subtask 2 focuses on classifying polarization types in Hausa, a Chadic language spoken by millions in West Africa. The task is built on the POLAR benchmark for multilingual, multicultural, and multi-event online polarization \citep{naseem2026polarbenchmarkmultilingualmulticultural}. The primary difficulty of this task was the distribution of labels: the vast majority of training data was non-polarized, making standard classifiers biased toward the ``negative'' (non-polarized) class. We follow the official task definition and scoring protocol provided by the SemEval-2026 Task 9 organizers \citep{naseem-etal-2026-polar}.

Our approach prioritized recall (finding polarized content) over precision. We hypothesized that for a content moderation task, missing a polarized comment (false negative) is significantly worse than flagging a neutral one (false positive).

%Social media polarization is a growing concern, particularly in the Global South where automated moderation tools are scarce. Subtask 2 focuses on classifying polarization types in Hausa, a Chadic language spoken by millions in West Africa. The primary difficulty of this task was the distribution of labels: the vast majority of training data was non-polarized, making standard classifiers biased toward the ``negative'' (non-polarized) class. We follow the official task definition and scoring protocol provided by the SemEval-2026 Task 9 organizers \citep{semeval2026task9}. 

%Our approach prioritized recall (finding polarized content) over precision. We hypothesized that for a content moderation task, missing a polarized comment (false negative) is significantly worse than flagging a neutral one (false positive). 

\section{Related Work}
\subsection{Polarization Detection and Text Classification}

Polarization detection in social media has gained increasing attention due to its societal impact, particularly in moderating harmful and divisive content. Prior work has approached this problem using supervised text classification methods, ranging from traditional machine learning models to deep neural architectures. Recent advances have demonstrated the effectiveness of transformer-based models such as BERT \citep{devlin2019bert} and its multilingual variants for capturing contextual semantics in social media text. In the context of shared tasks such as SemEval, transformer-based approaches have consistently outperformed traditional methods due to their ability to model complex linguistic patterns and contextual dependencies.

\subsection{Multilingual and Low-Resource Language Models}

Detecting harmful and polarized content has been extensively studied in high-resource languages \citep{waseem-hovy-2016-hateful, garimella2018quantifying}. However, transferring these capabilities to low-resource African languages like Hausa introduces unique challenges due to morphological complexity and data scarcity \citep{adelani-etal-2022-masakhaner}. 

Handling low-resource languages remains a major challenge in natural language processing. Multilingual pre-trained language models such as XLM-RoBERTa \citep{conneau2020} have shown strong cross-lingual transfer capabilities, enabling effective performance even with limited labeled data. More recently, models specifically adapted for African languages, such as Afro-XLMR \citep{alabi2022}, have been proposed to better capture linguistic characteristics unique to these languages. These models leverage multilingual adaptive fine-tuning to improve representation quality for underrepresented languages such as Hausa. Our work builds upon this line of research by employing Afro-XLMR-Large as the backbone model for polarization detection.

\subsection{Class Imbalance in Text Classification}

Class imbalance is a well-known issue in text classification tasks, particularly in real-world datasets where certain classes are underrepresented. Standard models tend to be biased toward majority classes, leading to poor performance on minority categories. Various strategies have been proposed to address this issue, including data-level methods such as undersampling and oversampling \citep{chawla2002smote, he2009learning}, as well as algorithm-level approaches such as cost-sensitive learning and weighted loss functions \citep{lin2017focal}. In transformer-based settings, weighted binary cross-entropy loss is commonly used to assign higher importance to minority classes, thereby improving recall for rare labels.

Recent studies have also explored dynamic sampling strategies to balance class distributions during training. These approaches construct mini-batches with controlled class ratios, preventing the model from learning skewed class priors. Our approach combines dynamic undersampling with a weighted loss function to effectively mitigate the impact of extreme class imbalance in the SemEval-2026 Task 9 dataset.

%Detecting harmful and polarized content has been extensively studied in high-resource languages \citep{waseem-hovy-2016-hateful, garimella2018quantifying}. However, transferring these capabilities to African languages like Hausa introduces unique challenges due to morphological complexity and data scarcity \citep{adelani-etal-2022-masakhaner}. 

%To bridge this gap, multilingual pre-trained language models have become the standard. Specifically, XLM-RoBERTa \citep{conneau2020} has shown strong zero-shot and few-shot capabilities. More recently, \citep{alabi2022} introduced Afro-XLMR, which adapts XLM-R specifically to African languages via multilingual adaptive fine-tuning, significantly improving performance on downstream tasks for languages like Hausa. Furthermore, handling extreme class imbalance in text classification is a well-documented challenge. Traditional methods involve oversampling minority classes (e.g., SMOTE) or undersampling majority classes \citep{chawla2002smote}. In this work, we combine Afro-XLMR with dynamic undersampling and weighted loss optimization to address the specific label skew of the POLAR dataset.


\section{Methodology}
Our system is built on the Afro-XLMR-Large encoder and is designed specifically for the low-resource, highly imbalanced setting of Hausa polarization detection. In this section, we describe the model architecture, our two-stage imbalance mitigation strategy, and the training configuration used for the final submission.

\subsection{Model Architecture}
We employed Afro-XLMR-Large \citep{alabi2022}, a variant of XLM-RoBERTa pre-trained on 17 African languages, including Hausa. We selected the ``Large'' variant rather than the ``Base'' model to benefit from its higher representational capacity and stronger contextual modeling, which are particularly important for distinguishing neutral religious expressions (e.g., prayers) from genuinely polarized or sectarian content.

We fine-tuned the model for multi-label classification, where each input text may belong to one or more polarization categories. A sigmoid activation function was applied at the output layer to produce independent probabilities for each label.

\subsection{Handling Class Imbalance}
The dataset is heavily imbalanced, with most instances belonging to the non-polarized class. To address this challenge, we adopted a hybrid strategy that combines data-level sampling with an algorithm-level loss adjustment.

\textbf{1:3 Dynamic Undersampling.} Given the scarcity of polarized examples, we used dynamic undersampling to control the class distribution during training. In each epoch, the majority class is subsampled to maintain a 1:3 ratio between polarized and non-polarized instances. Polarized examples are retained first, after which a subset of non-polarized examples is randomly drawn to preserve the target ratio. Repeating this procedure at every epoch exposes the model to different portions of the majority class over time, thereby reducing bias toward the dominant class without overfitting to a single reduced subset.

\begin{algorithm}
\small
\caption{Per-epoch dynamic undersampling}
\label{alg:undersample}
\begin{algorithmic}[1]
\State \textbf{Input:} training set $D$, ratio $r = 3$
\State $P \gets \{x \in D : \exists c,\, y_c(x) = 1\}$ \Comment{polarized}
\State $N \gets D \setminus P$ \Comment{non-polarized}
\For{each epoch}
  \State $N' \gets$ sample $r \cdot |P|$ from $N$ without replacement
  \State $D_{\text{epoch}} \gets \text{shuffle}(P \cup N')$
  \State train one epoch over $D_{\text{epoch}}$
\EndFor
\end{algorithmic}
\end{algorithm}

\textbf{Weighted Loss Function.} To further reduce the risk of the model never predicting rare classes (a ``zero-shot'' failure), we optimized the model using Weighted Binary Cross-Entropy (WBCE). For a sample $i$ and a given label, the loss is:

\begin{equation}
    \mathcal{L}_i = - \left[w \cdot y_i \log(p_i) + (1 - y_i) \log(1 - p_i)\right]
\end{equation}

where $y_i \in \{0,1\}$ is the ground-truth label, $p_i$ is the predicted probability for the positive class, and $w = 3.0$ is the weight assigned to the positive polarized class. This weighting increases the penalty for false negatives and improves the model's sensitivity to rare polarized examples.

\subsection{Training Procedure}

We fine-tuned the model using the Hugging Face Transformers framework. Input sequences were tokenized with the Afro-XLMR tokenizer and truncated to a maximum length of 128 tokens.

The model was trained with the AdamW optimizer using a learning rate of $2 \times 10^{-5}$. Due to GPU memory constraints, we used a batch size of 4 with gradient accumulation over 4 steps, yielding an effective batch size of 16. Training was conducted for 6 epochs on an NVIDIA T4 GPU.
\subsection{Model Selection and Ablation}

We conducted a series of experiments to evaluate the impact of different design choices.

First, we compared Afro-XLMR-Base and Afro-XLMR-Large under identical training conditions. The Large model achieved a higher Macro-F1 score on the development set (+2.1 points), indicating better contextual modeling.

Second, we evaluated training on the original imbalanced dataset versus dynamic undersampling. While undersampling improved recall for minority classes (+6.8 points), it also introduced more false positives, highlighting the trade-off between precision and recall.

Third, we compared standard binary cross-entropy with the weighted variant. The weighted loss improved performance on rare classes such as Gender/Sexual (+4.3 F1), demonstrating its effectiveness in handling label imbalance.

Based on these findings, we selected Afro-XLMR-Large with dynamic undersampling and weighted loss as our final system. We submitted three runs during the evaluation phase and report the best-performing run.
%\subsection{System Variants and Final Selection}

%%We tested multiple variants before selecting our final system. First, we compared Afro-XLMR-Base and Afro-XLMR-Large under the same optimization settings; the Large model yielded stronger development Macro-F1 (+2.1 points) and was retained. Second, we compared training on the original imbalanced data versus dynamic undersampling. Undersampling improved minority-class recall (+6.8 points) but increased false positives. Third, we compared unweighted BCE and weighted BCE; weighted BCE improved detection of rare labels such as Gender/Sexual (+4.3 F1 points for that class on development data). Based on this ablation process, we selected Afro-XLMR-Large with weighted BCE and 1:3 undersampling as our official submission. We submitted three runs to CodaBench during the evaluation period and reported the best-scoring run as the final system.
%\section{Experimental Setup}

%We implemented our system using the Hugging Face Transformers library.

%\begin{itemize}
%    \item Tokenizer: Afro-XLMR-Large Tokenizer (Max Length: 128).
 %   \item Optimizer: AdamW with learning rate 2e-5.
 %   \item Batch Size: 4 (with Gradient Accumulation = 4).
  %  \item Epochs: 6.
%    \item Hardware: NVIDIA T4 GPU.
%\end{itemize}

\section{Results and Discussion}

\subsection{Overall Performance}

In this section, we present our official results on the test set. Since the official Subtask 2 ranking metric
is Macro-F1, we report it first in Table~\ref{tab:results} and treat Micro-F1,
recall, and precision as supporting diagnostics.

\begin{table}[h]
\centering
\begin{tabular}{lc}
\hline
Metric & Score \\
\hline
F1 Macro (Ours) & 0.2346 \\
F1 Macro (Baseline) & 0.2160 \\
F1 Micro & 0.2581 \\
Recall Micro & 0.6166 \\
Precision Micro & 0.1632 \\
\hline
\end{tabular}
\caption{Comparison between our system and the baseline reported by the task organizers on Subtask 2 (Hausa).}
\label{tab:results}
\end{table}

Our system achieved a Macro-F1 score of 0.2346, outperforming the baseline reported by the task organizers (0.2160). This demonstrates that our imbalance-aware training strategy improves over a standard baseline in a low-resource setting.

Our model exhibits high recall (0.6166) but low precision (0.1632), indicating a tendency to over-predict polarization. This is consistent with our design objective, since in a moderation context, missing harmful content (false negatives) is often more critical than incorrectly flagging neutral content (false positives).

Figure~\ref{fig:results-overview} presents an additional visual summary of our overall system results.

\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{uploads/figure1.png}
\caption{Visual summary of the overall experimental results for our Hausa polarization detection system.}
\label{fig:results-overview}
\end{figure}

\subsection{Comparison with Leaderboard Systems}

In addition to the baseline comparison, we evaluate our performance relative to the leaderboard submissions. On the leaderboard, our submission is ranked 16th out of 22.

While this places our result in the lower-middle range, it reflects the overall difficulty of the task, particularly given the low-resource nature of Hausa and the severe class imbalance.
%The top-performing system achieved a Macro-F1 score of 0.4796, highlighting a substantial performance gap between current approaches and state-of-the-art systems.

The gap between our system and the top-ranked submissions suggests that more advanced techniques, such as improved representations, task-specific fine-tuning strategies, or additional data augmentation, are necessary to achieve top performance. Nevertheless, our results show that addressing class imbalance can lead to improved performance over the baseline, even if it does not yet match the best-performing system.

\subsection{Per-Class Performance}

We further analyze performance on individual polarization categories. Table~\ref{tab:per-class-results} summarizes the precision, recall, and F1-score for the categories reported in our analysis.

\begin{table}[t]
\centering
\small
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lccc}
\hline
Category & Precision & Recall & F1-Score \\
\hline
Political & 0.38 & 0.65 & 0.48 \\
Religious & 0.08 & 0.24 & 0.12 \\
Linguistic & 0.22 & 0.51 & 0.31 \\
Macro Average & 0.16 & 0.61 & 0.23 \\
\hline
\end{tabular}
\caption{Per-category performance of our system on selected polarization classes.}
\label{tab:per-class-results}
\end{table}

\begin{itemize}
    \item \textbf{Political (F1: 0.48):} This is the best-performing category. The model successfully captures political entities and keywords such as party acronyms (e.g., PDP, APC).%, which act as strong lexical signals.
    
    \item \textbf{Religious (F1: 0.12):} Performance remains comparatively weak in this category. The model struggles to distinguish between neutral religious expressions (e.g., prayers) and genuinely polarized content, resulting in a substantial number of false positives.
    
    \item \textbf{Linguistic (F1: 0.31):} This category shows moderate performance, suggesting that the model can capture some lexical and stylistic markers of linguistic polarization, although there is still room for improvement.
    
    \item \textbf{Macro Average (F1: 0.23):} The overall macro-level result reflects the imbalance challenge of the task: while recall is relatively strong, precision remains limited across categories, lowering the final average performance.
\end{itemize}

\subsection{Impact of Imbalance Handling}

Our results confirm the importance of addressing class imbalance in low-resource classification tasks. Dynamic undersampling exposes the model to a more balanced distribution during training, improving its ability to detect minority classes. At the same time, the weighted loss function increases the contribution of rare labels to the optimization objective.

However, these strategies also introduce a trade-off: while recall improves, precision decreases. This shows the difficulty of balancing detection sensitivity and prediction reliability in highly skewed datasets.

Figure~\ref{fig:analysis-visual} provides a complementary visual illustration for the detailed analysis discussed in this section.

\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{uploads/figure2.png}
\caption{Visual illustration accompanying the detailed analysis of system behavior and class-level performance.}
\label{fig:analysis-visual}
\end{figure}

\subsection{Error Analysis}

A careful study of our model predictions reveals some error patterns. First, the model often misclassifies neutral religious expressions as polarized content due to lexical overlap. Second, ambiguous statements are frequently misinterpreted, showing the limitations of relying only on textual cues without broader context. Finally, the scarcity of training examples in some categories limits the model's ability to generalize effectively.

These observations suggest directions for future work, including incorporating richer contextual information, applying data augmentation techniques, and exploring calibration methods to better manage the precision--recall trade-off.

Figure~\ref{fig:error-analysis-visual} provides an additional visual summary related to the error patterns and qualitative analysis discussed above.

\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{uploads/figure3.png}
\caption{Additional visual summary supporting the error analysis and qualitative discussion of system predictions.}
\label{fig:error-analysis-visual}
\end{figure}

%\section{Results and Analysis}

%Our official results on the test set are summarized in Table 1. Since the official Subtask 2 ranking metric is Macro-F1, we report it first and treat Micro-F1, recall, and precision as supporting diagnostics.

%\begin{table}[h]
%\centering
%\begin{tabular}{lc}
%\hline
%Metric & Score \\
%\hline
%F1 Macro (Official) & 0.2346 \\
%F1 Micro & 0.2581 \\
%Recall Micro & 0.6166 \\
%Precision Micro & 0.1632 \\
%\hline
%\end{tabular}
%\caption{Official Subtask 2 Results. Macro-F1 is the official ranking metric; recall-oriented tuning yields high recall but lower precision.}
%\end{table}

%\subsection{Per-Class Performance}

%\begin{itemize}
 %   \item \textbf{Political (F1: 0.48):} Our best performing category. The model successfully identified political entities (e.g., party acronyms like PDP, APC).
 %   \item \textbf{Racial/Ethnic (F1: 0.34):} The model learned ethnic slurs effectively.
  %  \item \textbf{Religious (F1: 0.17):} This category suffered from high False Positives. The model struggled to differentiate between piety (invoking God) and polarization.
  %  \item \textbf{Gender/Sexual (F1: 0.13):} Despite being the rarest class, our Weighted Loss strategy allowed the model to detect it, preventing ``Zero-Shot'' failure.
%\end{itemize}

\section{Conclusion}

Our participation in SemEval-2026 highlights the trade-off between precision and recall in highly imbalanced datasets. By using Afro-XLMR-Large and dynamic undersampling, we built a system that is highly sensitive to polarization (recall above 60\%). While this results in more false positives, it shows that low-resource models can successfully learn to flag harmful content even with limited training examples.

For reproducibility, we will release training scripts, configuration files, and inference code after the official evaluation period.

\section*{Ethical Considerations}

Our system is designed as a decision-support tool for moderation, not a fully autonomous judge. Because false positives may disproportionately affect dialectal and identity-linked expressions, human review remains necessary before punitive actions are taken.

%\section{System Overview}

%\subsection{Model Architecture}
%We employed Afro-XLMR-Large \citep{alabi2022}, a variant of XLM-ROBERTa pre-trained specifically on 17 African languages, including Hausa. We chose the ``Large'' variant over ``Base'' to leverage its superior contextual understanding, which is crucial for distinguishing between neutral religious mentions (e.g., prayers) and religious polarization (e.g., sectarian attacks).

%%The training data was heavily skewed. To address this, we implemented a two-pronged strategy:

%\paragraph{Dynamic Undersampling (1:3 Ratio):} Instead of static dataset reduction, we utilized a custom PyTorch \texttt{WeightedRandomSampler} during the DataLoader instantiation. This sampler dynamically constructs each batch to ensure a 1:3 ratio of polarized to non-polarized examples. This effectively prevented the model from collapsing into majority-class prediction while preserving exposure to varying non-polarized contexts across epochs.

%\paragraph{Weighted Loss Function:} We replaced standard Cross Entropy with \texttt{BCEWithLogitsLoss}, applying calculated positive class weights ($w_{c}$) for each label $c$. The loss function is defined as:

%\begin{equation}
%\label{eq:loss}
%\mathcal{L}_c = - \left( w_c \cdot y_c \cdot \log(\sigma(x_c)) + (1-y_c) \cdot \log(1 - \sigma(x_c)) \right)
%\end{equation}

%Where $w_{c}$ was dynamically calculated based on the inverse frequency of the class in the training set. Rare classes like Gender/Sexual were assigned significantly higher weights ($\approx6.1$) compared to common classes like Political ($\approx3.3$).

%\section{Experimental Setup}
%We implemented our system using the Hugging Face Transformers library.
%\begin{itemize}
 %   \item \textbf{Tokenizer:} Afro-XLMR-Large Tokenizer (Max Length: 128).
  %  \item \textbf{Optimizer:} AdamW with learning rate 2e-5.
  %  \item \textbf{Batch Size:} 4 (with Gradient Accumulation = 4).
  %  \item \textbf{Epochs:} 6.
  %  \item \textbf{Hardware:} NVIDIA T4 GPU.
%\end{itemize}

%\section{Results and Analysis}
%Our official results on the test set are summarized in Table~\ref{tab:results}, demonstrating a clear improvement over the official SemEval baseline.

%\begin{table}[h]
%\centering
%\begin{tabular}{lcc}
%\toprule
%\textbf{System} & \textbf{Macro-F1} & \textbf{Micro-F1} \\
%\midrule
%POLAR Baseline & 0.2038 & - \\
%\textbf{Team HausaNLP (Ours)} & \textbf{0.2346} & \textbf{0.2581} \\
%\bottomrule
%\end{tabular}
%\caption{Official Subtask 2 Results (Hausa). Our dynamic undersampling approach outperforms the official task baseline on the primary ranking metric.}
%\label{tab:results}
%\end{table}

%Our recall-oriented tuning successfully maximized the detection of marginalized classes (Micro-Recall: 0.6166), though this resulted in lower precision (0.1632).

%\subsection{Error Analysis and Per-Class Performance}
%\begin{itemize}
 %   \item \textbf{Political (F1: 0.48):} Our best performing category. The model successfully identified political entities (e.g., party acronyms like PDP, APC).
%    \item \textbf{Racial/Ethnic (F1: 0.34):} The model learned ethnic slurs effectively.
%    \item \textbf{Religious (F1: 0.17):} This category suffered from high False Positives. The model struggled to differentiate between piety (invoking God) and polarization.
 %   \item \textbf{Gender/Sexual (F1: 0.13):} Despite being the rarest class, our Weighted Loss strategy allowed the model to detect it, preventing ``Zero-Shot'' failure.
%\end{itemize}

%The low precision score is largely attributable to dialectal polysemy in Hausa. For example, religious terms used for benign blessings are lexically identical to those used in sectarian hostility, confusing the context window.

%\section{Conclusion}
%Our participation in SemEval-2026 highlights the trade-off between Precision and Recall in highly imbalanced datasets. By using Afro-XLMR-Large and dynamic undersampling, we built a system highly sensitive to polarization. While this results in false positives, it proves that low-resource models can successfully flag harmful content. 

%\section*{Ethical Considerations}
%Our system is designed as a decision-support tool for moderation, not an autonomous judge. Because false positives may disproportionately affect dialectal expressions, human review remains necessary.

% --- REFERENCES ---
\begin{thebibliography}{5}
\expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi

\bibitem[{Naseem et~al.(2026a)}]{naseem2026polarbenchmarkmultilingualmulticultural}
Usman Naseem, Robert Geislinger, Juan Ren, Sarah Kohail, Rudy Garrido Veliz, P Sam Sahil, Yiran Zhang, Marco Antonio Stranisci, Idris Abdulmumin, Özge Alacam, Cengiz Acartürk, Aisha Jabr, Saba Anwar, Abinew Ali Ayele, Simona Frenda, Alessandra Teresa Cignarella, Elena Tutubalina, Oleg Rogov, Aung Kyaw Htet, Xintong Wang, Surendrabikram Thapa, Kritesh Rauniyar, Tanmoy Chakraborty, Arfeen Zeeshan, Dheeraj Kodati, Satya Keerthi, Sahar Moradizeyveh, Firoj Alam, Arid Hasan, Syed Ishtiaque Ahmed, Ye Kyaw Thu, Shantipriya Parida, Ihsan Ayyub Qazi, Lilian Wanzare, Nelson Odhiambo Onyango, Clemencia Siro, Jane Wanjiru Kimani, Ibrahim Said Ahmad, Adem Chanie Ali, Martin Semmann, Chris Biemann, Shamsuddeen Hassan Muhammad, and Seid Muhie Yimam. 2026{\natexlab{a}}.
\newblock POLAR: A benchmark for multilingual, multicultural, and multi-event online polarization.
\newblock \emph{arXiv preprint arXiv:2505.20624}.
\newblock URL \url{https://arxiv.org/abs/2505.20624}.

\bibitem[{Naseem et~al.(2026b)}]{naseem-etal-2026-polar}
Usman Naseem, Robert Geislinger, Juan Ren, Sarah Kohail, Rudy Garrido Veliz, P Sam Sahil, Yiran Zhang, Marco Antonio Stranisci, Idris Abdulmumin, Özge Alacam, Cengiz Acartürk, Aisha Jabr, Saba Anwar, Abinew Ali Ayele, Elena Tutubalina, Aung Kyaw Htet, Xintong Wang, Surendrabikram Thapa, Tanmoy Chakraborty, Dheeraj Kodati, Sahar Moradizeyveh, Firoj Alam, Ye Kyaw Thu, Shantipriya Parida, Ihsan Ayyub Qazi, Nelson Odhiambo Onyango, Clemencia Siro, Ibrahim Said Ahmad, Lilian Wanzare, Adem Chanie Ali, Martin Semmann, Chris Biemann, Shamsuddeen Hassan Muhammad, and Seid Muhie Yimam. 2026{\natexlab{b}}.
\newblock SemEval-2026 Task 9: Detecting multilingual, multicultural and multievent online polarization.
\newblock In \emph{Proceedings of the 20th International Workshop on Semantic Evaluation (SemEval-2026)}, San Diego, CA, USA. Association for Computational Linguistics.

\bibitem[{Devlin et~al.(2019)}]{devlin2019bert}
Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019.
\newblock BERT: Pre-training of deep bidirectional transformers for language understanding.
\newblock In \emph{Proceedings of NAACL}.

\bibitem[{Waseem and Hovy(2016)}]{waseem-hovy-2016-hateful}
Zeerak Waseem and Dirk Hovy. 2016.
\newblock Hateful symbols or hateful people? Predictive features for hate speech detection on Twitter.
\newblock In \emph{Proceedings of SRW at HLT-NAACL}.

\bibitem[{Garimella et~al.(2018)}]{garimella2018quantifying}
Kiran Garimella, Gianmarco De Francisci Morales, Aristides Gionis, and Michael Mathioudakis. 2018.
\newblock Quantifying controversy on social media.
\newblock \emph{ACM Transactions on Social Computing}, 1(1):1--27.

\bibitem[{Adelani et~al.(2022)}]{adelani-etal-2022-masakhaner}
David Adelani, et~al. 2022.
\newblock MasakhaNER 2.0: Africa-centric Transfer Learning for Named Entity Recognition.
\newblock In \emph{Proceedings of EMNLP}.

\bibitem[{Conneau et~al.(2020)}]{conneau2020}
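Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzm{\'a}n, Edouard Grave, Myle Ott, Luke Zettlemoyer, and Veselin Stoyanov. 2020.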
\newblock Unsupervised cross-lingual representation learning at scale.
\newblock In \emph{Proceedings of ACL}.

\bibitem[{Alabi et~al.(2022)}]{alabi2022}
Jesujoba~O. Alabi, David~Ifeoluwa Adelani, Marius Mosbach, and Dietrich Klakow. 2022.
\newblock Adapting pre-trained language models to African languages via multilingual adaptive fine-tuning.
\newblock In \emph{Proceedings of COLING}.

\bibitem[{Chawla et~al.(2002)}]{chawla2002smote}
Nitesh~V. Chawla, Kevin~W. Bowyer, Lawrence~O. Hall, and W.~Philip Kegelmeyer. 2002.
\newblock SMOTE: Synthetic minority over-sampling technique.
\newblock \emph{Journal of artificial intelligence research}, 16:321--357.

\bibitem[{He and Garcia(2009)}]{he2009learning}
Haibo He and Edwardo A. Garcia. 2009.
\newblock Learning from imbalanced data.
\newblock \emph{IEEE Transactions on Knowledge and Data Engineering}.

\bibitem[{Lin et~al.(2017)}]{lin2017focal}
Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, and Piotr Doll{\'a}r. 2017.
\newblock Focal loss for dense object detection.
\newblock In \emph{Proceedings of ICCV}.







%\bibitem[{Naseem et~al.(2026)}]{semeval2026task9}
%Usman Naseem, et~al. 2026.
%\newblock SemEval-2026 Task 9: Polarization detection in social media comments.
%\newblock \emph{SemEval-2026 Proceedings}.


\end{thebibliography}

\end{document}