@inproceedings{abdul-mageed-etal-2021-mega,
title = "Mega-{COV}: A Billion-Scale Dataset of 100+ Languages for {COVID}-19",
author = "Abdul-Mageed, Muhammad and
Elmadany, AbdelRahim and
Nagoudi, El Moatez Billah and
Pabbi, Dinesh and
Verma, Kunal and
Lin, Rannie",
booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
month = apr,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.eacl-main.298",
doi = "10.18653/v1/2021.eacl-main.298",
pages = "3402--3420",
abstract = "We describe Mega-COV, a billion-scale dataset from Twitter for studying COVID-19. The dataset is diverse (covers 268 countries), longitudinal (goes as back as 2007), multilingual (comes in 100+ languages), and has a significant number of location-tagged tweets ({\textasciitilde}169M tweets). We release tweet IDs from the dataset. We also develop two powerful models, one for identifying whether or not a tweet is related to the pandemic (best F1=97{\%}) and another for detecting misinformation about COVID-19 (best F1=92{\%}). A human annotation study reveals the utility of our models on a subset of Mega-COV. Our data and models can be useful for studying a wide host of phenomena related to the pandemic. Mega-COV and our models are publicly available.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="abdul-mageed-etal-2021-mega">
<titleInfo>
<title>Mega-COV: A Billion-Scale Dataset of 100+ Languages for COVID-19</title>
</titleInfo>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="family">Abdul-Mageed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">AbdelRahim</namePart>
<namePart type="family">Elmadany</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">El</namePart>
<namePart type="given">Moatez</namePart>
<namePart type="given">Billah</namePart>
<namePart type="family">Nagoudi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dinesh</namePart>
<namePart type="family">Pabbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kunal</namePart>
<namePart type="family">Verma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rannie</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-apr</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We describe Mega-COV, a billion-scale dataset from Twitter for studying COVID-19. The dataset is diverse (covers 268 countries), longitudinal (goes as back as 2007), multilingual (comes in 100+ languages), and has a significant number of location-tagged tweets (~169M tweets). We release tweet IDs from the dataset. We also develop two powerful models, one for identifying whether or not a tweet is related to the pandemic (best F1=97%) and another for detecting misinformation about COVID-19 (best F1=92%). A human annotation study reveals the utility of our models on a subset of Mega-COV. Our data and models can be useful for studying a wide host of phenomena related to the pandemic. Mega-COV and our models are publicly available.</abstract>
<identifier type="citekey">abdul-mageed-etal-2021-mega</identifier>
<identifier type="doi">10.18653/v1/2021.eacl-main.298</identifier>
<location>
<url>https://aclanthology.org/2021.eacl-main.298</url>
</location>
<part>
<date>2021-apr</date>
<extent unit="page">
<start>3402</start>
<end>3420</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mega-COV: A Billion-Scale Dataset of 100+ Languages for COVID-19
%A Abdul-Mageed, Muhammad
%A Elmadany, AbdelRahim
%A Nagoudi, El Moatez Billah
%A Pabbi, Dinesh
%A Verma, Kunal
%A Lin, Rannie
%S Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume
%D 2021
%8 apr
%I Association for Computational Linguistics
%C Online
%F abdul-mageed-etal-2021-mega
%X We describe Mega-COV, a billion-scale dataset from Twitter for studying COVID-19. The dataset is diverse (covers 268 countries), longitudinal (goes as back as 2007), multilingual (comes in 100+ languages), and has a significant number of location-tagged tweets (~169M tweets). We release tweet IDs from the dataset. We also develop two powerful models, one for identifying whether or not a tweet is related to the pandemic (best F1=97%) and another for detecting misinformation about COVID-19 (best F1=92%). A human annotation study reveals the utility of our models on a subset of Mega-COV. Our data and models can be useful for studying a wide host of phenomena related to the pandemic. Mega-COV and our models are publicly available.
%R 10.18653/v1/2021.eacl-main.298
%U https://aclanthology.org/2021.eacl-main.298
%U https://doi.org/10.18653/v1/2021.eacl-main.298
%P 3402-3420
Markdown (Informal)
[Mega-COV: A Billion-Scale Dataset of 100+ Languages for COVID-19](https://aclanthology.org/2021.eacl-main.298) (Abdul-Mageed et al., EACL 2021)
ACL
- Muhammad Abdul-Mageed, AbdelRahim Elmadany, El Moatez Billah Nagoudi, Dinesh Pabbi, Kunal Verma, and Rannie Lin. 2021. Mega-COV: A Billion-Scale Dataset of 100+ Languages for COVID-19. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pages 3402–3420, Online. Association for Computational Linguistics.