@inproceedings{terreau-etal-2021-writing,
    title = "Writing Style Author Embedding Evaluation",
    author = "Terreau, Enzo and
      Gourru, Antoine and
      Velcin, Julien",
    booktitle = "Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems",
    month = nov,
    year = "2021",
    address = "Punta Cana, Dominican Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.eval4nlp-1.9",
    doi = "10.18653/v1/2021.eval4nlp-1.9",
    pages = "84--93",
    abstract = "Learning author representations from their textual productions is now widely used to solve multiple downstream tasks, such as classification, link prediction, or user recommendation. Author embedding methods are often built on top of either Doc2Vec (Le and Mikolov, 2014) or the Transformer architecture (Devlin et al., 2019). Evaluating the quality of these embeddings, and what they capture, is a difficult task. Most articles use either classification accuracy or authorship attribution, which does not clearly measure the quality of the representation space or whether it really captures what it was built for. In this paper, we propose a novel evaluation framework for author embedding methods based on writing style. It allows us to quantify whether the embedding space effectively captures a set of stylistic features chosen as the best proxy of an author{'}s writing style. This approach gives less importance to the topics conveyed by the documents. It turns out that recent models are mostly driven by the inner semantics of authors{'} productions. They are outperformed on several linguistic axes by simple baselines built on state-of-the-art pretrained sentence embedding models. These baselines grasp complex linguistic phenomena and writing style more effectively, paving the way for designing new style-driven author embedding models.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="terreau-etal-2021-writing">
    <titleInfo>
      <title>Writing Style Author Embedding Evaluation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Enzo</namePart>
      <namePart type="family">Terreau</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Antoine</namePart>
      <namePart type="family">Gourru</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Julien</namePart>
      <namePart type="family">Velcin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Learning author representations from their textual productions is now widely used to solve multiple downstream tasks, such as classification, link prediction, or user recommendation. Author embedding methods are often built on top of either Doc2Vec (Le and Mikolov, 2014) or the Transformer architecture (Devlin et al., 2019). Evaluating the quality of these embeddings, and what they capture, is a difficult task. Most articles use either classification accuracy or authorship attribution, which does not clearly measure the quality of the representation space or whether it really captures what it was built for. In this paper, we propose a novel evaluation framework for author embedding methods based on writing style. It allows us to quantify whether the embedding space effectively captures a set of stylistic features chosen as the best proxy of an author’s writing style. This approach gives less importance to the topics conveyed by the documents. It turns out that recent models are mostly driven by the inner semantics of authors’ productions. They are outperformed on several linguistic axes by simple baselines built on state-of-the-art pretrained sentence embedding models. These baselines grasp complex linguistic phenomena and writing style more effectively, paving the way for designing new style-driven author embedding models.</abstract>
<identifier type="citekey">terreau-etal-2021-writing</identifier>
<identifier type="doi">10.18653/v1/2021.eval4nlp-1.9</identifier>
<location>
<url>https://aclanthology.org/2021.eval4nlp-1.9</url>
</location>
<part>
<date>2021-nov</date>
<extent unit="page">
<start>84</start>
<end>93</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Writing Style Author Embedding Evaluation
%A Terreau, Enzo
%A Gourru, Antoine
%A Velcin, Julien
%S Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems
%D 2021
%8 nov
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F terreau-etal-2021-writing
%X Learning author representations from their textual productions is now widely used to solve multiple downstream tasks, such as classification, link prediction, or user recommendation. Author embedding methods are often built on top of either Doc2Vec (Le and Mikolov, 2014) or the Transformer architecture (Devlin et al., 2019). Evaluating the quality of these embeddings, and what they capture, is a difficult task. Most articles use either classification accuracy or authorship attribution, which does not clearly measure the quality of the representation space or whether it really captures what it was built for. In this paper, we propose a novel evaluation framework for author embedding methods based on writing style. It allows us to quantify whether the embedding space effectively captures a set of stylistic features chosen as the best proxy of an author’s writing style. This approach gives less importance to the topics conveyed by the documents. It turns out that recent models are mostly driven by the inner semantics of authors’ productions. They are outperformed on several linguistic axes by simple baselines built on state-of-the-art pretrained sentence embedding models. These baselines grasp complex linguistic phenomena and writing style more effectively, paving the way for designing new style-driven author embedding models.
%R 10.18653/v1/2021.eval4nlp-1.9
%U https://aclanthology.org/2021.eval4nlp-1.9
%U https://doi.org/10.18653/v1/2021.eval4nlp-1.9
%P 84-93
Markdown (Informal)
[Writing Style Author Embedding Evaluation](https://aclanthology.org/2021.eval4nlp-1.9) (Terreau et al., Eval4NLP 2021)
ACL
Enzo Terreau, Antoine Gourru, and Julien Velcin. 2021. Writing Style Author Embedding Evaluation. In Proceedings of the 2nd Workshop on Evaluation and Comparison of NLP Systems, pages 84–93, Punta Cana, Dominican Republic. Association for Computational Linguistics.
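
The evaluation idea the abstract describes (checking whether an author embedding space encodes stylistic features rather than topic) can be illustrated with a minimal Python sketch. This is a hedged illustration, not the authors' published framework: the feature set, the ridge probe, and the names `stylistic_features` and `probe_style` are assumptions chosen for clarity, and the paper's curated feature list is far richer.

```python
# Illustrative only: a linear style probe in the spirit of the paper's
# framework. The features below are a toy stand-in for a curated set.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

FEATURE_NAMES = ["avg_word_len", "avg_sent_len", "punct_rate", "type_token_ratio"]

def stylistic_features(texts):
    """Compute a small, hand-crafted stylistic feature vector per author text."""
    rows = []
    for text in texts:
        words = text.split()
        # Crude sentence split; a real probe would use a proper tokenizer.
        sents = [s for s in text.split(".") if s.strip()]
        rows.append([
            np.mean([len(w) for w in words]) if words else 0.0,   # avg word length
            len(words) / max(len(sents), 1),                      # avg sentence length
            sum(c in ",;:!?" for c in text) / max(len(text), 1),  # punctuation rate
            len(set(words)) / max(len(words), 1),                 # type-token ratio
        ])
    return np.asarray(rows)

def probe_style(embeddings, texts):
    """Cross-validated R^2 of a ridge probe per stylistic feature.
    Higher scores mean the embedding space encodes that stylistic axis."""
    targets = stylistic_features(texts)
    return {
        name: cross_val_score(Ridge(alpha=1.0), embeddings, targets[:, j],
                              cv=5, scoring="r2").mean()
        for j, name in enumerate(FEATURE_NAMES)
    }

# Hypothetical usage: one embedding row and one concatenated text per author.
# embeddings = np.random.randn(200, 768)
# texts = load_author_texts()  # placeholder, not a real API
# print(probe_style(embeddings, texts))
```

An embedding model driven purely by topic would score near zero on these axes, which mirrors the contrast the abstract draws between semantics-driven author embeddings and style-aware sentence-embedding baselines.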