% Findings of EMNLP 2020 paper. The url field holds the canonical ACL Anthology
% address (not a temporary preview-branch deployment); the doi is the stable identifier.
@inproceedings{mussmann-etal-2020-importance,
  author    = {Mussmann, Stephen and
               Jia, Robin and
               Liang, Percy},
  title     = {{O}n the {I}mportance of {A}daptive {D}ata {C}ollection for {E}xtremely {I}mbalanced {P}airwise {T}asks},
  editor    = {Cohn, Trevor and
               He, Yulan and
               Liu, Yang},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2020},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {3400--3413},
  doi       = {10.18653/v1/2020.findings-emnlp.305},
  url       = {https://aclanthology.org/2020.findings-emnlp.305/},
  abstract  = {Many pairwise classification tasks, such as paraphrase detection and open-domain question answering, naturally have extreme label imbalance (e.g., 99.99{\%} of examples are negatives). In contrast, many recent datasets heuristically choose examples to ensure label balance. We show that these heuristics lead to trained models that generalize poorly: State-of-the art models trained on QQP and WikiQA each have only 2.4{\%} average precision when evaluated on realistically imbalanced test data. We instead collect training data with active learning, using a BERT-based embedding model to efficiently retrieve uncertain points from a very large pool of unlabeled utterance pairs. By creating balanced training data with more informative negative examples, active learning greatly improves average precision to 32.5{\%} on QQP and 20.1{\%} on WikiQA.},
}
Markdown (Informal)
[On the Importance of Adaptive Data Collection for Extremely Imbalanced Pairwise Tasks](https://aclanthology.org/2020.findings-emnlp.305/) (Mussmann et al., Findings 2020)
ACL