% MAGPIE corpus paper (LREC 2020); ACL Anthology ID 2020.lrec-1.35.
% The url field uses the canonical aclanthology.org address rather than a
% temporary preview-deployment link, which is not a stable citation target.
@inproceedings{haagsma-etal-2020-magpie,
  title     = {{MAGPIE}: A Large Corpus of Potentially Idiomatic Expressions},
  author    = {Haagsma, Hessel and
               Bos, Johan and
               Nissim, Malvina},
  editor    = {Calzolari, Nicoletta and
               B{\'e}chet, Fr{\'e}d{\'e}ric and
               Blache, Philippe and
               Choukri, Khalid and
               Cieri, Christopher and
               Declerck, Thierry and
               Goggi, Sara and
               Isahara, Hitoshi and
               Maegaard, Bente and
               Mariani, Joseph and
               Mazo, H{\'e}l{\`e}ne and
               Moreno, Asuncion and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.lrec-1.35/},
  pages     = {279--287},
  language  = {eng},
  isbn      = {979-10-95546-34-4},
  abstract  = {Given the limited size of existing idiom corpora, we aim to enable progress in automatic idiom processing and linguistic analysis by creating the largest-to-date corpus of idioms for English. Using a fixed idiom list, automatic pre-extraction, and a strictly controlled crowdsourced annotation procedure, we show that it is feasible to build a high-quality corpus comprising more than 50K instances, an order of a magnitude larger than previous resources. Crucial ingredients of crowdsourcing were the selection of crowdworkers, clear and comprehensive instructions, and an interface that breaks down the task in small, manageable steps. Analysis of the resulting corpus revealed strong effects of genre on idiom distribution, providing new evidence for existing theories on what influences idiom usage. The corpus also contains rich metadata, and is made publicly available.},
}
Markdown (Informal)
[MAGPIE: A Large Corpus of Potentially Idiomatic Expressions](https://aclanthology.org/2020.lrec-1.35/) (Haagsma et al., LREC 2020)
ACL