% SemEval-2025 Task 9 (food hazard detection) system paper; ACL Anthology ID 2025.semeval-1.210.
% The url field uses the canonical Anthology address rather than a preview/staging deployment.
@inproceedings{saad-etal-2025-hu,
  title     = {{HU} at {SemEval}-2025 Task 9: Leveraging {LLM}-Based Data Augmentation for Class Imbalance},
  author    = {Saad, Muhammad and
               Abbas, Meesum and
               Kumar, Sandesh and
               Samad, Abdul},
  editor    = {Rosenthal, Sara and
               Ros{\'a}, Aiala and
               Ghosh, Debanjan and
               Zampieri, Marcos},
  booktitle = {Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.semeval-1.210/},
  pages     = {1593--1601},
  isbn      = {979-8-89176-273-2},
  abstract  = {This paper presents a solution to the food hazard detection challenge in the SemEval-2025 Task 9, focusing on overcoming class imbalance using data augmentation techniques. We employ large language models (LLMs) like GPT-4o, Gemini Flash 1.5, and T5 to generate synthetic data, alongside other methods like synonym replacement, back-translation, and paraphrasing. These augmented datasets are used to fine-tune transformer-based models such as DistilBERT, improving their performance in detecting food hazards and categorizing products. Our approach achieves notable improvements in macro-F1 scores for both subtasks, although challenges remain in detecting implicit hazards and handling extreme class imbalance. The paper also discusses various techniques, including class weighting and ensemble modeling, as part of the training process. Despite the improvements, further work is necessary to refine hazard detection, particularly for rare and implicit categories.},
}
Markdown (Informal)
[HU at SemEval-2025 Task 9: Leveraging LLM-Based Data Augmentation for Class Imbalance](https://aclanthology.org/2025.semeval-1.210/) (Saad et al., SemEval 2025)
ACL