% CLiC-it 2024 paper: data augmentation for Italian DRS semantic parsing and generation.
@inproceedings{amin-etal-2024-data,
  title     = {Data Augmentation for Low-Resource {Italian} {NLP}: Enhancing Semantic Processing with {DRS}},
  author    = {Amin, Muhammad Saad and
               Anselma, Luca and
               Mazzei, Alessandro},
  editor    = {Dell'Orletta, Felice and
               Lenci, Alessandro and
               Montemagni, Simonetta and
               Sprugnoli, Rachele},
  booktitle = {Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)},
  month     = dec,
  year      = {2024},
  address   = {Pisa, Italy},
  publisher = {CEUR Workshop Proceedings},
  url       = {https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.clicit-1.5/},
  pages     = {29--38},
  isbn      = {979-12-210-7060-6},
  abstract  = {Discourse Representation Structure (DRS), a formal meaning representation, has shown promising results in semantic parsing and natural language generation tasks for high-resource languages like English. This paper investigates enhancing the application of DRS to low-resource Italian Natural Language Processing (NLP), in both semantic parsing (Text-to-DRS) and natural language generation (DRS-to-Text). To address the scarcity of annotated corpora for Italian DRS, we propose a novel data augmentation technique that involves the use of external linguistic resources including: (i) WordNet for common nouns, adjectives, adverbs, and verbs; (ii) LLM-generated named entities for proper nouns; and (iii) rule-based algorithms for tense augmentation. This approach not only increases the quantity of training data but also introduces linguistic diversity, which is crucial for improving model performance and robustness. Using this augmented dataset, we developed neural semantic parser and generator models that demonstrated enhanced generalization ability compared to models trained on non-augmented data. We evaluated the effect of semantic data augmentation using two state-of-the-art transformer-based neural sequence-to-sequence models, i.e., byT5 and IT5. Our implementation shows promising results for Italian semantic processing. Data augmentation significantly increased the performance of semantic parsing from 76.10 to 90.56 (+14.46{\%}) F1-SMATCH score and generation with 37.79 to 57.48 (+19.69{\%}) BLEU, 30.83 to 40.95 (+10.12{\%}) METEOR, 81.66 to 90.97 (+9.31{\%}) COMET, 54.84 to 70.88 (+16.04{\%}) chrF, and 88.86 to 92.97 (+4.11{\%}) BERT scores. These results demonstrate the effectiveness of our novel augmentation approach in enhancing semantic processing capabilities for low-resource languages like Italian.},
}
Markdown (Informal)
[Data Augmentation for Low-Resource Italian NLP: Enhancing Semantic Processing with DRS](https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.clicit-1.5/) (Amin et al., CLiC-it 2024)
ACL