@inproceedings{nguyen-etal-2024-ceval-benchmark,
title = "{CE}val: A Benchmark for Evaluating Counterfactual Text Generation",
author = {Nguyen, Van Bach and
Seifert, Christin and
Schl{\"o}tterer, J{\"o}rg},
editor = "Mahamood, Saad and
Minh, Nguyen Le and
Ippolito, Daphne",
booktitle = "Proceedings of the 17th International Natural Language Generation Conference",
month = sep,
year = "2024",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.inlg-main.6/",
pages = "55--69",
abstract = "Counterfactual text generation aims to minimally change a text, such that it is classified differently. Assessing progress in method development for counterfactual text generation is hindered by a non-uniform usage of data sets and metrics in related work. We propose CEval, a benchmark for comparing counterfactual text generation methods. CEval unifies counterfactual and text quality metrics, includes common counterfactual datasets with human annotations, standard baselines (MICE, GDBA, CREST) and the open-source language model LLAMA-2. Our experiments found no perfect method for generating counterfactual text. Methods that excel at counterfactual metrics often produce lower-quality text while LLMs with simple prompts generate high-quality text but struggle with counterfactual criteria. By making CEval available as an open-source Python library, we encourage the community to contribute additional methods and maintain consistent evaluation in future work."
}