\documentclass[11pt]{article}
% \usepackage[review]{acl}
\usepackage{acl}           
\usepackage{times}
\usepackage{latexsym}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{url}
\usepackage{microtype}
\usepackage{inconsolata}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{amsmath}
\usepackage{enumitem}
\usepackage{xcolor}
\definecolor{paperbg}{RGB}{248,244,232}   
\definecolor{framegray}{RGB}{90,90,90}    
\usepackage{placeins}
\usepackage{tikz}
\usetikzlibrary{arrows.meta,positioning,fit}
\usepackage[most]{tcolorbox}

\definecolor{pipeblue}{RGB}{44,102,176}
\definecolor{pipebg}{RGB}{247,250,255}
\definecolor{stagebg}{RGB}{252,252,252}
\definecolor{accent}{RGB}{28,140,120}
\title{DANGNT@SGU at SemEval-2026 Task 1: \\
A Two-Stage Mistral Generator with DistilBERT Reranking for English Humor Generation}

\author{
Tan Loc Nguyen\textsuperscript{\dag},
Dang Tuan Nguyen\textsuperscript{\ddag}\thanks{Corresponding author: Dang Tuan Nguyen (\texttt{dangnt@sgu.edu.vn}).} \\
\textsuperscript{\dag}Faculty of Information Technology, Ton Duc Thang University, Ho Chi Minh City, Vietnam \\
\textsuperscript{\ddag}Faculty of Information Technology, Saigon University, Ho Chi Minh City, Vietnam \\
\texttt{nguyenloctan.0409@gmail.com} \quad \texttt{dangnt@sgu.edu.vn}
}

\begin{document}
\maketitle

\begin{abstract}
We describe DANGNT@SGU's system for the English track of SemEval-2026 Task~1 (MWAHAHA), Subtask~A (text-based humor generation). Our pipeline combines a two-stage QLoRA-adapted generator based on \texttt{mistralai/Mistral-7B-Instruct-v0.2} with a DistilBERT reranker trained to distinguish jokes from non-jokes. The generator is first adapted on a raw joke corpus for general humor style, then further tuned on synthetic task-format instruction--response pairs for Word Inclusion and News Headline prompts. At inference time, we generate five candidates per input, optionally enforce lexical constraints for Word Inclusion prompts, and rerank candidates with the classifier. In the official English Subtask~A results, our team \texttt{DANGNT@SGU} obtained Elo 962 (95\% CI: 926--986), ranking 13th. The system is practical, reproducible, and based entirely on open models and public data.
\end{abstract}

\section{Introduction}
SemEval-2026 Task~1 (MWAHAHA) studies constrained humor generation; in Subtask~A, systems generate jokes under explicit constraints such as mandatory word inclusion or a news-headline prompt \citep{semeval2026mwahaha}. The task is evaluated with human pairwise judgments aggregated into Elo-style rankings, making output quality and perceived funniness the main optimization target.

We describe our \textbf{English-only} submission, which follows a simple engineering-oriented design: a QLoRA-adapted Mistral-7B-Instruct generator plus a lightweight DistilBERT reranker. Because the task does not provide gold joke targets for the official prompts, we create synthetic supervision from a public joke corpus and the official English input file. Our team \texttt{DANGNT@SGU} ranked \textbf{13th} in the English leaderboard with Elo \textbf{962} (95\% CI: 926--986) \citep{mwahaha2026results}. The main contributions are: (1) a two-stage generator adaptation strategy (general humor $\rightarrow$ task constraints), (2) a simple joke-vs.-non-joke reranker for candidate selection, and (3) a best-of-5 inference pipeline with optional lexical constraint enforcement for Word Inclusion prompts.

\section{Task Setup}
We participate in SemEval-2026 Task~1 (MWAHAHA), Subtask~A, English track \citep{semeval2026mwahaha}. The task includes two prompt types:
\begin{itemize}[leftmargin=1.2em]
    \item \textbf{Word Inclusion}: generate a short joke that must include two specified words.
    \item \textbf{News Headline}: generate a short joke related to a given news headline.
\end{itemize}
The official English TSV is used at inference time, but no gold joke targets are provided for direct supervised training on the official prompts; this motivates our synthetic-data pipeline. Official evaluation is human-centered and pairwise, and leaderboard scores are reported as Elo ratings with confidence intervals, which motivates our best-of-5 generation and reranking strategy.

\section{Method}
\begin{figure}[t]
    \centering
    \setlength{\fboxsep}{0pt}

    \begin{tcolorbox}[
        enhanced,
        width=0.97\linewidth,
        colback=pipebg,
        colframe=pipeblue,
        boxrule=0.7pt,
        arc=2mm,
        left=4pt,right=4pt,top=4pt,bottom=4pt,
        title=\textsc{Pipeline Summary},
        coltitle=white,
        colbacktitle=pipeblue,
        fonttitle=\small\bfseries
    ]

    % Stage 1
    \begin{tcolorbox}[
        colback=stagebg,colframe=black!20,boxrule=0.4pt,arc=1.2mm,
        left=3pt,right=3pt,top=2pt,bottom=2pt
    ]
    \small
    \textbf{1) Synthetic data construction} \hfill {\color{accent}\textbf{offline}}\\[-1pt]
    \begin{itemize}[leftmargin=1.2em,itemsep=1pt,topsep=1pt,parsep=0pt]
        \item Build \textbf{Word Inclusion} instruction--response pairs from jokes (keyword-based).
        \item Build \textbf{News Headline} pairs by headline--joke matching (embedding similarity).
    \end{itemize}
    \end{tcolorbox}

    \vspace{1pt}\centering{\footnotesize$\Downarrow$}\vspace{1pt}

    % Stage 2
    \begin{tcolorbox}[
        colback=stagebg,colframe=black!20,boxrule=0.4pt,arc=1.2mm,
        left=3pt,right=3pt,top=2pt,bottom=2pt
    ]
    \small
    \textbf{2) Generator training (Stage 1)} \hfill {\color{accent}\textbf{QLoRA}}\\[-1pt]
    Fine-tune \textbf{Mistral-7B-Instruct} on raw jokes to learn short-joke style and punchline rhythm.
    \end{tcolorbox}

    \vspace{1pt}\centering{\footnotesize$\Downarrow$}\vspace{1pt}

    % Stage 3
    \begin{tcolorbox}[
        colback=stagebg,colframe=black!20,boxrule=0.4pt,arc=1.2mm,
        left=3pt,right=3pt,top=2pt,bottom=2pt
    ]
    \small
    \textbf{3) Generator training (Stage 2)} \hfill {\color{accent}\textbf{task adaptation}}\\[-1pt]
    Continue QLoRA training on synthetic prompts so the model follows shared-task formats
    (\textit{Word Inclusion} + \textit{News Headline}).
    \end{tcolorbox}

    \vspace{1pt}\centering{\footnotesize$\Downarrow$}\vspace{1pt}

    % Stage 4
    \begin{tcolorbox}[
        colback=stagebg,colframe=black!20,boxrule=0.4pt,arc=1.2mm,
        left=3pt,right=3pt,top=2pt,bottom=2pt
    ]
    \small
    \textbf{4) Inference + reranking} \hfill {\color{accent}\textbf{best-of-5}}\\[-1pt]
    \begin{itemize}[leftmargin=1.2em,itemsep=1pt,topsep=1pt,parsep=0pt]
        \item Generate \textbf{5 candidates} per prompt (sampling).
        \item Optionally enforce \textbf{word inclusion} with regex checks (Word Inclusion only).
        \item Score candidates with a \textbf{DistilBERT reranker} and select the top output.
    \end{itemize}
    \end{tcolorbox}

    \vspace{3pt}

    % Output strip
    \begin{tcolorbox}[
        colback=pipeblue!6,colframe=pipeblue!50,boxrule=0.4pt,arc=1.2mm,
        left=3pt,right=3pt,top=2pt,bottom=2pt
    ]
    \small
    \textbf{Outputs:} reranker-ranked files 
    \end{tcolorbox}

    \end{tcolorbox}

    \caption{Compact overview of our two-stage generator + DistilBERT reranker pipeline for English Subtask A.}
    \label{fig:pipeline_compact}
\end{figure}
The overall system is illustrated in Figure~\ref{fig:pipeline_compact}, and can be summarized into four major stages: (1) synthetic data preparation, (2) Stage~1 generator training (general humor adaptation), (3) Stage~2 generator training (constraint adaptation), and (4) best-of-5 inference with reranking.


\subsection{Data sources}
We use three public data sources:
\begin{itemize}[leftmargin=1.2em]
    \item \textbf{Humor corpus}: the \textit{Short Jokes} dataset on Kaggle by Abhinav Moudgil (\texttt{shortjokes.csv})~\citep{moudgil_short_jokes}, used for Stage~1 humor-style adaptation and for synthetic pair generation. This dataset has also been used in prior humor research~\citep{chen-soo-2018-humor}.
    \item \textbf{Task inputs}: the official English input file for Subtask~A (TSV), which contains both Word Inclusion and News Headline prompts.
    \item \textbf{Non-humor corpus for reranker negatives}: English Wikipedia from Hugging Face (\texttt{wikimedia/wikipedia}, snapshot \texttt{20231101.en}) \citep{hf_wikimedia_wikipedia}, used as negative examples for DistilBERT reranker training.
\end{itemize}

Because the task does not provide gold target jokes for the official prompts, we generate synthetic supervision by constructing task-format instruction--response examples from a public joke corpus (and the official headlines for News Headline prompts), with responses sampled or matched from the joke corpus.

\subsection{Synthetic data preparation}
\paragraph{Word Inclusion synthetic pairs.}
For each joke in the joke corpus, we extract keyword candidates using POS tagging and dependency parsing (spaCy). We prioritize content words (NOUN, PROPN, ADJ, VERB) and filter out stopwords and punctuation. We then select two keywords and build an instruction of the form:
\begin{quote}\small
Create a short joke that includes the words ``\texttt{w1}'' and ``\texttt{w2}''. 
\end{quote}
The original joke text is used as the response. This yields a large set of synthetic word-inclusion training pairs.

\paragraph{News Headline synthetic pairs.}
To construct headline-style supervision, we embed official news headlines and joke texts using a sentence-transformer model. For each headline, we retrieve jokes whose cosine similarity exceeds a threshold (0.4), and create instruction--response pairs:
\begin{quote}\small
Create a short joke related to this headline: ``...''
\end{quote}
This procedure introduces topical supervision, although it is noisy because the joke corpus is not natively aligned to real news.

\paragraph{Data balancing.}
The Word Inclusion synthetic pairs are much more numerous than the News Headline pairs. To avoid overfitting to one prompt type, we cap the ratio used in Stage~2 training (production setting: approximately 10:1 for Word Inclusion:News Headline).

\subsection{Two-stage generator training}
\paragraph{Backbone model and adaptation.}
We use \texttt{mistralai/Mistral-7B-Instruct-v0.2} as the generator. Training is performed with 4-bit quantization (QLoRA-style setup) and LoRA adapters inserted into attention projection layers (q, k, v, o).

\paragraph{Stage~1: general humor style adaptation.}
In Stage~1, we train the generator directly on the raw joke corpus as an instruction-following generation task, so the model learns short-joke style, rhythm, and punchline structure.

\paragraph{Stage~2: task-format adaptation.}
In Stage~2, we continue fine-tuning from the Stage~1 adapters using the synthetic task-format prompts (Word Inclusion + News Headline). This stage teaches the model to follow the exact prompt structures expected by the shared task while preserving the humor style learned in Stage~1.

\subsection{DistilBERT reranker}
We train a lightweight reranker whose score serves as a proxy for humor quality. It is trained on two classes:
\begin{itemize}[leftmargin=1.2em]
    \item \textbf{Positive class}: jokes from the humor corpus
    \item \textbf{Negative class}: non-humorous text sampled from English Wikipedia (Hugging Face \texttt{wikimedia/wikipedia}, snapshot \texttt{20231101.en})
\end{itemize}

The reranker is a binary DistilBERT classifier. At inference time, we use its output score to choose the better candidate among multiple samples from the generator.

\subsection{Inference and submission generation}
At inference time, the pipeline reads the official English file and formats prompts automatically based on the row type (Word Inclusion vs.\ News Headline). We generate \textbf{five candidates} per input using sampling (best-of-5). For Word Inclusion prompts, an optional regex-based check can be applied to enforce the lexical constraint, with regeneration when a required word is missing. We then score the candidates with the DistilBERT reranker and keep the top-ranked output for the main submission.

\section{Experimental Setup}
\subsection{Resources and software}
We use the following main components:
{\small
\begin{itemize}[leftmargin=1.1em,itemsep=0.15em,topsep=0.2em]
    \item \textbf{Generator}: Mistral-7B-Instruct-v0.2 (HF id: \url{mistralai/Mistral-7B-Instruct-v0.2}) \citep{jiang2023mistral}
    \item \textbf{Reranker}: DistilBERT (\url{distilbert-base-uncased}) \citep{sanh2019distilbert}
    \item \textbf{Headline--joke matching}: Sentence-Transformers model \url{all-MiniLM-L6-v2} \citep{reimers2019sentencebert}
    \item \textbf{Linguistic preprocessing}: spaCy (\url{en_core_web_sm}) \citep{honnibal2020spacy}
    \item \textbf{Frameworks}: Transformers, Datasets, PEFT/LoRA, and TRL \citep{wolf2020transformers, lhoest2021datasets, hu2022lora, dettmers2023qlora}
\end{itemize}
}

\subsection{Main hyperparameters}
Table~\ref{tab:hyperparams} summarizes the main hyperparameters and training/inference settings used in our final English Subtask~A system. We include the generator, reranker, and decoding configurations to support reproducibility.

\begin{table}[!t]
\centering
\scriptsize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{p{0.29\columnwidth} p{0.66\columnwidth}}
\toprule
\textbf{Component} & \textbf{Configuration} \\
\midrule
Stage 1 generator & Mistral-7B-Instruct-v0.2 + QLoRA (4-bit NF4, bf16 compute) \\
Stage 1 LoRA & $r=64$, $\alpha=16$, dropout $=0.1$, target modules = q/k/v/o projections \\
Stage 1 training & 3 epochs, batch size 16, grad accum 1, LR $2\times10^{-4}$, cosine schedule, warmup 0.03 \\
Stage 2 generator & Continue training from Stage~1 adapters (same base model, 4-bit) \\
Stage 2 balancing & Word:News = 10:1 cap (enabled in production mode) \\
Stage 2 training & 3 epochs, batch size 16, grad accum 1, LR $1\times10^{-4}$, cosine schedule, warmup 0.03 \\
Reranker data & Positive = jokes, Negative = \texttt{wikimedia/wikipedia} (\texttt{20231101.en}) \\
Reranker & DistilBERT binary classifier, max length 128, train/test split = 90/10 \\
Reranker training & 1 epoch, batch size 64, LR $2\times10^{-5}$, eval/save steps = 2000 \\
Inference & best-of-5 generation, reranker selection, optional regex word filter \\
Decoding defaults & temperature = 0.7, top-k = 50, max new tokens = 75, seed = 42 \\
\bottomrule
\end{tabular}
\caption{Main system settings used in our English Subtask~A pipeline.}
\label{tab:hyperparams}
\end{table}

\subsection{Reproducibility}
We release the full implementation and scripts for our submission pipeline.\footnote{\url{https://github.com/tanloc49/dangnt-sgu-mwahaha}} The script exports reranker-ranked outputs from top-1 to top-5 for each input. We submitted one official run: \texttt{task-a-en\_top1.tsv} (primary). Outputs \texttt{top2}--\texttt{top5} are provided for local inspection and reproducibility analysis. We also document the training schedule, key hyperparameters, and exact scripts used in the final pipeline. The system relies only on open pretrained models and public datasets, and our task-specific supervision is generated by rule-based preprocessing (keyword extraction, embedding-based headline pairing, and threshold filtering), which is deterministic once the seed and preprocessing configuration are fixed. For replication, we recommend using the library versions listed in the repository and the same decoding settings reported in this paper. Minor variation may still occur across hardware/software environments when sampled generation is used.

\section{Results and Analysis}
\begin{table}[t]
\centering
\small
\begin{tabular}{l c}
\toprule
\textbf{Item} & \textbf{Value} \\
\midrule
Leaderboard name & \texttt{DANGNT@SGU} \\
Subtask / language & Subtask~A / English \\
Official Elo & 962 \\
95\% confidence interval & [926, 986] \\
Official rank & 13 \\
\bottomrule
\end{tabular}
\caption{Official result reported by the task organizers for our English submission.}
\label{tab:official-results}
\end{table}

\subsection{Official SemEval result}
Table~\ref{tab:official-results} reports the official English Subtask~A result of our primary run (reranker top-1 selection) \citep{mwahaha2026results}. The official evaluation is based on human pairwise judgments aggregated into Elo, so local automatic metrics should be interpreted only as proxy diagnostics.

\subsection{Local proxy ablation protocol}
Because the organizers do not release gold target jokes for the official prompts, we evaluate system variants on a local dev-style set for controlled comparison. Our local set contains 100 prompts sampled from the official English input format (50 Word Inclusion, 50 News Headline), and is used only for internal ablation analysis.

We compare three configurations to isolate the effect of stage-wise generator training under a fixed decoding/selection setting (top-1 output; best-of-1) and without reranking: (i) zero-shot Mistral-7B-Instruct; (ii) Stage~2 only, trained from the same base model as the zero-shot configuration (no Stage~1) using only the synthetic task-format data; and (iii) Stage~1+Stage~2. For Word Inclusion prompts, we measure raw lexical compliance without any post-hoc regeneration; the optional regex-based regeneration described in our inference procedure is used only in the final submission pipeline.

We report two lightweight diagnostics:
\begin{itemize}[leftmargin=1.2em]
    \item \textbf{W.I. Success (\%)}: percentage of Word Inclusion outputs that contain both required words (case-insensitive word-boundary regex).
    \item \textbf{Avg. DistilBERT}: mean reranker score of the final selected output (proxy for joke-likeness).
\end{itemize}

\subsection{Quantitative ablation results}
\begin{table*}[t]
\centering
\small
\renewcommand{\arraystretch}{1.25}
\setlength{\tabcolsep}{6pt}
\begin{tabular}{lcc}
\toprule
\textbf{Pipeline configuration} & \textbf{W.I. Success (\%) $\uparrow$} & \textbf{Avg. DistilBERT $\uparrow$} \\
\midrule
Zero-shot Mistral-7B-Instruct & 68 & 0.44 \\
Stage~2 only (task-format adaptation) & 96 & 0.58 \\
Stage~1 + Stage~2 (humor-style + format) & \textbf{96} & \textbf{0.74} \\
\bottomrule
\end{tabular}
\caption{Stage-wise local proxy comparison on a 100-prompt dev-style set (50 Word Inclusion, 50 News Headline), using the top-1 output (best-of-1, no reranker). W.I. Success is computed on the Word Inclusion subset; Avg. DistilBERT is the mean score of our DistilBERT joke-likeness classifier over all 100 final outputs. These proxy metrics are for internal system comparison only and are not directly comparable to the official human Elo evaluation.}
\label{tab:local-ablation}
\end{table*}

Table~\ref{tab:local-ablation} highlights two main trends under a fixed decoding/selection setting (top-1 output). First, task-format adaptation (Stage~2) is the main driver of lexical constraint satisfaction, improving W.I. Success substantially over zero-shot prompting (68\% $\rightarrow$ 96\%, i.e., 34/50 $\rightarrow$ 48/50 on Word Inclusion). Second, Stage~1 humor-style adaptation improves the joke-likeness proxy strongly (0.58 $\rightarrow$ 0.74) while maintaining the same W.I. Success, suggesting that the two-stage design helps separate \emph{format following} from \emph{humor style learning}.

Overall, this stage-wise ablation supports the final design choice: Stage~2 is necessary for prompt compliance, and Stage~1 improves stylistic quality. At the same time, these numbers remain proxy-based and do not replace human preference evaluation.

\subsection{Error analysis}
Our local error inspection suggests three common failure modes. \textbf{(1) Constraint--quality gap:} for Word Inclusion prompts, the model often includes both required words but produces a weak or literal punchline, indicating that lexical compliance alone does not guarantee humor quality. \textbf{(2) Headline relevance drift:} for News Headline prompts, outputs are often joke-like but only loosely related to the headline content, likely due to the noise in our synthetic headline--joke matching procedure. \textbf{(3) Reranker preference bias:} the DistilBERT reranker sometimes favors short, generic joke patterns over more creative candidates, because it is trained as a binary joke/non-joke classifier rather than on human pairwise preferences. These observations complement our stage-wise ablation and motivate future work on task-aware reranking and stronger prompt--response relevance modeling.

\section{Discussion}
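% NOTE(review): the parenthetical below compares official Elo for a top-1 and a top-2 run, but the Reproducibility section lists only one official run (task-a-en_top1.tsv) -- confirm which is correct before camera-ready.
Our results suggest that the system benefits from a modular design: Stage~1 learns short-joke style, Stage~2 adapts the model to the shared-task prompt formats, and reranking provides a practical selection rule under stochastic decoding (our reranker top-1 run outperforms the top-2 run in official Elo). This decomposition also makes the pipeline easier to debug because each component has a distinct role.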

The ablation study also clarifies the limits of the current approach. The DistilBERT reranker is helpful as a \emph{joke-likeness} proxy, but it is not trained on human pairwise preferences, so improvements in Avg. DistilBERT may not translate directly into Elo gains. Similarly, our synthetic News Headline supervision improves fluency and format compliance but does not fully solve headline relevance.

Two practical directions follow from these findings. First, a task-aware reranker trained on prompt--response relevance (and ideally human preference signals) would better match the official evaluation objective. Second, constrained decoding could replace or reduce regex-based post-checking for Word Inclusion prompts, improving robustness without relying on reactive filtering.

\section{Conclusion}
We presented DANGNT@SGU's English system for SemEval-2026 Task~1 (MWAHAHA) Subtask~A: a two-stage QLoRA-adapted Mistral generator with DistilBERT reranking for Word Inclusion and News Headline prompts. Our team \texttt{DANGNT@SGU} ranked 13th on the English leaderboard (Elo 962). The pipeline is lightweight, reproducible, and fully based on open models and public data, providing a practical baseline for controllable humor generation.


% Single-file bibliography
\begin{thebibliography}{}

\bibitem[Castro et~al.(2026)]{semeval2026mwahaha}
Santiago Castro, Luis Chiruzzo, Santiago G{\'o}ngora, Salar Rahili, Naihao Deng, Ignacio Sastre, Victoria Amoroso, Guillermo Rey, Aiala Ros{\'a}, Guillermo Moncecchi, J.~A. Meaney, Juan Jos{\'e} Prada, and Rada Mihalcea. 2026.
SemEval-2026 Task~1: MWAHAHA, Models Write Automatic Humor And Humans Annotate.
In \emph{Proceedings of the 20th International Workshop on Semantic Evaluation (SemEval-2026)}.

\bibitem[Castro and Chiruzzo(2026)]{mwahaha2026submission}
Santiago Castro and Luis Chiruzzo. 2026.
Submission list, annotation, and paper submission.
Google Groups post to the \emph{semeval-2026-task-1-humor-gen} mailing list, February 3, 2026.

\bibitem[Chiruzzo(2026)]{mwahaha2026results}
Luis Chiruzzo. 2026.
Final results.
Google Groups post to the \emph{semeval-2026-task-1-humor-gen} mailing list, February 21, 2026.

\bibitem[Jiang et~al.(2023)]{jiang2023mistral}
Albert Q. Jiang et~al. 2023.
Mistral 7B.
arXiv preprint arXiv:2310.06825.

\bibitem[Sanh et~al.(2019)]{sanh2019distilbert}
Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019.
DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter.
arXiv preprint arXiv:1910.01108.

\bibitem[Reimers and Gurevych(2019)]{reimers2019sentencebert}
Nils Reimers and Iryna Gurevych. 2019.
Sentence-BERT: Sentence embeddings using Siamese BERT-networks.
In \emph{Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}.

\bibitem[Honnibal et~al.(2020)]{honnibal2020spacy}
Matthew Honnibal, Ines Montani, Sofie Van~Landeghem, and Adriane Boyd. 2020.
spaCy: Industrial-strength Natural Language Processing in Python.
Zenodo.

\bibitem[Wolf et~al.(2020)]{wolf2020transformers}
Thomas Wolf et~al. 2020.
Transformers: State-of-the-art natural language processing.
In \emph{Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations}.

\bibitem[Lhoest et~al.(2021)]{lhoest2021datasets}
Quentin Lhoest et~al. 2021.
Datasets: A community library for natural language processing.
In \emph{Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations}.

\bibitem[Hu et~al.(2022)]{hu2022lora}
Edward J. Hu et~al. 2022.
LoRA: Low-rank adaptation of large language models.
In \emph{International Conference on Learning Representations (ICLR)}.

\bibitem[Dettmers et~al.(2023)]{dettmers2023qlora}
Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. 2023.
QLoRA: Efficient finetuning of quantized LLMs.
In \emph{Advances in Neural Information Processing Systems (NeurIPS)}.

\bibitem[Chen and Soo(2018)]{chen-soo-2018-humor}
Peng-Yu Chen and Von-Wun Soo. 2018.
Humor Recognition Using Deep Learning.
In \emph{Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)}, pages 113--117, New Orleans, Louisiana.
Association for Computational Linguistics.
doi:10.18653/v1/N18-2018.

\bibitem[Moudgil(n.d.)]{moudgil_short_jokes}
Abhinav Moudgil. n.d.
\emph{Short Jokes}.
Kaggle dataset.
\url{https://www.kaggle.com/datasets/abhinavmoudgil95/short-jokes}.
Accessed: 2026-02-25.

\bibitem[Hugging Face and Wikimedia Foundation(n.d.)]{hf_wikimedia_wikipedia}
Hugging Face and Wikimedia Foundation. n.d.
\emph{wikimedia/wikipedia}.
Hugging Face Datasets.
\url{https://huggingface.co/datasets/wikimedia/wikipedia}.
Configuration used: \texttt{20231101.en}. Accessed: 2026-02-25.

\end{thebibliography}
\end{document}