@inproceedings{lai-etal-2020-diversity,
title = "Diversity, Density, and Homogeneity: Quantitative Characteristic Metrics for Text Collections",
author = "Lai, Yi-An and
Zhu, Xuan and
Zhang, Yi and
Diab, Mona",
booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.215",
pages = "1739--1746",
abstract = "Summarizing data samples by quantitative measures has a long history, with descriptive statistics being a case in point. However, as natural language processing methods flourish, there are still insufficient characteristic metrics to describe a collection of texts in terms of the words, sentences, or paragraphs they comprise. In this work, we propose metrics of diversity, density, and homogeneity that quantitatively measure the dispersion, sparsity, and uniformity of a text collection. We conduct a series of simulations to verify that each metric holds desired properties and resonates with human intuitions. Experiments on real-world datasets demonstrate that the proposed characteristic metrics are highly correlated with text classification performance of a renowned model, BERT, which could inspire future applications.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lai-etal-2020-diversity">
<titleInfo>
<title>Diversity, Density, and Homogeneity: Quantitative Characteristic Metrics for Text Collections</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yi-An</namePart>
<namePart type="family">Lai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuan</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mona</namePart>
<namePart type="family">Diab</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>Summarizing data samples by quantitative measures has a long history, with descriptive statistics being a case in point. However, as natural language processing methods flourish, there are still insufficient characteristic metrics to describe a collection of texts in terms of the words, sentences, or paragraphs they comprise. In this work, we propose metrics of diversity, density, and homogeneity that quantitatively measure the dispersion, sparsity, and uniformity of a text collection. We conduct a series of simulations to verify that each metric holds desired properties and resonates with human intuitions. Experiments on real-world datasets demonstrate that the proposed characteristic metrics are highly correlated with text classification performance of a renowned model, BERT, which could inspire future applications.</abstract>
<identifier type="citekey">lai-etal-2020-diversity</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.215</url>
</location>
<part>
<date>2020-may</date>
<extent unit="page">
<start>1739</start>
<end>1746</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Diversity, Density, and Homogeneity: Quantitative Characteristic Metrics for Text Collections
%A Lai, Yi-An
%A Zhu, Xuan
%A Zhang, Yi
%A Diab, Mona
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F lai-etal-2020-diversity
%X Summarizing data samples by quantitative measures has a long history, with descriptive statistics being a case in point. However, as natural language processing methods flourish, there are still insufficient characteristic metrics to describe a collection of texts in terms of the words, sentences, or paragraphs they comprise. In this work, we propose metrics of diversity, density, and homogeneity that quantitatively measure the dispersion, sparsity, and uniformity of a text collection. We conduct a series of simulations to verify that each metric holds desired properties and resonates with human intuitions. Experiments on real-world datasets demonstrate that the proposed characteristic metrics are highly correlated with text classification performance of a renowned model, BERT, which could inspire future applications.
%U https://aclanthology.org/2020.lrec-1.215
%P 1739-1746
Markdown (Informal)
[Diversity, Density, and Homogeneity: Quantitative Characteristic Metrics for Text Collections](https://aclanthology.org/2020.lrec-1.215) (Lai et al., LREC 2020)
ACL