@inproceedings{yacouby-axman-2020-probabilistic,
title = "Probabilistic Extension of Precision, Recall, and F1 Score for More Thorough Evaluation of Classification Models",
author = "Yacouby, Reda and
Axman, Dustin",
editor = "Eger, Steffen and
Gao, Yang and
Peyrard, Maxime and
Zhao, Wei and
Hovy, Eduard",
booktitle = "Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2020.eval4nlp-1.9/",
doi = "10.18653/v1/2020.eval4nlp-1.9",
pages = "79--91",
abstract = "In pursuit of the perfect supervised NLP classifier, razor thin margins and low-resource test sets can make modeling decisions difficult. Popular metrics such as Accuracy, Precision, and Recall are often insufficient as they fail to give a complete picture of the model`s behavior. We present a probabilistic extension of Precision, Recall, and F1 score, which we refer to as confidence-Precision (cPrecision), confidence-Recall (cRecall), and confidence-F1 (cF1) respectively. The proposed metrics address some of the challenges faced when evaluating large-scale NLP systems, specifically when the model`s confidence score assignments have an impact on the system`s behavior. We describe four key benefits of our proposed metrics as compared to their threshold-based counterparts. Two of these benefits, which we refer to as robustness to missing values and sensitivity to model confidence score assignments are self-evident from the metrics' definitions; the remaining benefits, generalization, and functional consistency are demonstrated empirically."
}
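
The abstract describes replacing hard true/false-positive counts with the model's confidence mass. The snippet below is a minimal illustrative sketch of that idea for a single class, not the paper's exact definitions: it assumes `probs` holds the model's confidence that each example belongs to `target_class`, and the helper name `confidence_prf1` is hypothetical.

```python
def confidence_prf1(y_true, probs, target_class):
    """Sketch of confidence-weighted precision/recall/F1 for one class.

    y_true: true labels; probs: model confidence (floats in [0, 1]) that
    each example belongs to target_class. Assumed formulation, not the
    authors' reference implementation.
    """
    # Accumulate confidence mass instead of 0/1 counts.
    c_tp = sum(p for y, p in zip(y_true, probs) if y == target_class)
    c_fn = sum(1.0 - p for y, p in zip(y_true, probs) if y == target_class)
    c_fp = sum(p for y, p in zip(y_true, probs) if y != target_class)

    c_precision = c_tp / (c_tp + c_fp) if (c_tp + c_fp) else 0.0
    c_recall = c_tp / (c_tp + c_fn) if (c_tp + c_fn) else 0.0
    c_f1 = (2 * c_precision * c_recall / (c_precision + c_recall)
            if (c_precision + c_recall) else 0.0)
    return c_precision, c_recall, c_f1

# Example: three examples, two of class "A".
print(confidence_prf1(["A", "B", "A"], [0.9, 0.2, 0.6], "A"))
```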