@inproceedings{laban-etal-2021-transformer,
title = "Can Transformer Models Measure Coherence In Text: Re-Thinking the Shuffle Test",
author = "Laban, Philippe and
Dai, Luke and
Bandarkar, Lucas and
Hearst, Marti A.",
editor = "Zong, Chengqing and
Xia, Fei and
Li, Wenjie and
Navigli, Roberto",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2021.acl-short.134/",
doi = "10.18653/v1/2021.acl-short.134",
pages = "1058--1064",
abstract = "The Shuffle Test is the most common task to evaluate whether NLP models can measure coherence in text. Most recent work uses direct supervision on the task; we show that by simply finetuning a RoBERTa model, we can achieve a near perfect accuracy of 97.8{\%}, a state-of-the-art. We argue that this outstanding performance is unlikely to lead to a good model of text coherence, and suggest that the Shuffle Test should be approached in a Zero-Shot setting: models should be evaluated without being trained on the task itself. We evaluate common models in this setting, such as Generative and Bi-directional Transformers, and find that larger architectures achieve high-performance out-of-the-box. Finally, we suggest the k-Block Shuffle Test, a modification of the original by increasing the size of blocks shuffled. Even though human reader performance remains high (around 95{\%} accuracy), model performance drops from 94{\%} to 78{\%} as block size increases, creating a conceptually simple challenge to benchmark NLP models."
}
Markdown (Informal)
[Can Transformer Models Measure Coherence In Text: Re-Thinking the Shuffle Test](https://aclanthology.org/2021.acl-short.134/) (Laban et al., ACL-IJCNLP 2021)
ACL
Philippe Laban, Luke Dai, Lucas Bandarkar, and Marti A. Hearst. 2021. Can Transformer Models Measure Coherence In Text: Re-Thinking the Shuffle Test. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers), pages 1058–1064, Online. Association for Computational Linguistics.
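For context, a minimal sketch of the k-Block Shuffle Test perturbation the abstract describes: instead of shuffling individual sentences, the document is split into consecutive blocks of k sentences and the blocks are reordered, so local coherence within each block is preserved. The function name and interface below are illustrative assumptions, not the authors' implementation.

```python
import random

def k_block_shuffle(sentences, k, seed=None):
    """Return a k-block shuffled version of a document.

    Splits the sentence list into consecutive blocks of k sentences
    and shuffles the block order; k=1 recovers the original Shuffle
    Test. Illustrative sketch only, not the paper's code.
    """
    rng = random.Random(seed)
    blocks = [sentences[i:i + k] for i in range(0, len(sentences), k)]
    rng.shuffle(blocks)
    return [s for block in blocks for s in block]

# Example: with k=2, adjacent sentence pairs stay together, so only
# the ordering between blocks is perturbed.
doc = ["S1.", "S2.", "S3.", "S4.", "S5.", "S6."]
print(k_block_shuffle(doc, k=2, seed=0))
```

Per the abstract, the task is then to distinguish an original document from its shuffled counterpart; larger k makes the perturbation subtler, which is why model accuracy drops from 94% to 78% as block size increases while human accuracy stays around 95%.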