% RANLP 2021 conference paper (ACL Anthology ID 2021.ranlp-1.35).
% The url field holds the canonical ACL Anthology landing page, not a staging/preview build.
@inproceedings{corbeil-abdi-ghavidel-2021-assessing,
  title     = {Assessing the Eligibility of Backtranslated Samples Based on Semantic Similarity for the Paraphrase Identification Task},
  author    = {Corbeil, Jean-Philippe and
               Abdi Ghavidel, Hadi},
  editor    = {Mitkov, Ruslan and
               Angelova, Galia},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing ({RANLP} 2021)},
  month     = sep,
  year      = {2021},
  address   = {Held Online},
  publisher = {INCOMA Ltd.},
  url       = {https://aclanthology.org/2021.ranlp-1.35/},
  pages     = {301--308},
  abstract  = {In the domain of natural language augmentation, the eligibility of generated samples remains not well understood. To gather insights around this eligibility issue, we apply a transformer-based similarity calculation within the BET framework based on backtranslation, in the context of automated paraphrase detection. While providing a rigorous statistical foundation to BET, we push their results by analyzing statistically the impacts of the level of qualification, and several sample sizes. We conducted a vast amount of experiments on the MRPC corpus using six pre-trained models: BERT, XLNet, Albert, RoBERTa, Electra, and DeBerta. We show that our method improves significantly these ``base'' models while using only a fraction of the corpus. Our results suggest that using some of those smaller pre-trained models, namely RoBERTa base and Electra base, helps us reach F1 scores very close to their large counterparts, as reported on the GLUE benchmark. On top of acting as a regularizer, the proposed method is efficient in dealing with data scarcity with improvements of around 3{\%} in F1 score for most pre-trained models, and more than 7.5{\%} in the case of Electra.},
}
Markdown (Informal)
[Assessing the Eligibility of Backtranslated Samples Based on Semantic Similarity for the Paraphrase Identification Task](https://aclanthology.org/2021.ranlp-1.35/) (Corbeil & Abdi Ghavidel, RANLP 2021)
ACL