% LaTeCH-CLfL 2026 workshop paper, ACL Anthology ID 2026.latechclfl-1.17.
% url: uses the canonical Anthology address built from that ID rather than the
%   temporary "preview.aclanthology.org/ingest-eacl" staging address.
% abstract: the relative improvement is given as 20%, which is what the quoted
%   purities imply (62.5 / 52.0 = 1.20) -- NOTE(review): confirm against the
%   published abstract.
% address: holds the conference venue (ACL Anthology convention), not the
%   publisher's city.
@inproceedings{muller-2026-semantic-factor,
  title     = {Semantic Factor Analysis: Validating Personality Structure Recovery from empirically-mediated Word Embeddings},
  author    = {M{\"u}ller, Oliver},
  editor    = {Alves, Diego and
               Bizzoni, Yuri and
               Degaetano-Ortlieb, Stefania and
               Kazantseva, Anna and
               Pagel, Janis and
               Szpakowicz, Stan},
  booktitle = {Proceedings of the 10th Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.latechclfl-1.17/},
  pages     = {176--188},
  isbn      = {979-8-89176-373-9},
  abstract  = {The present study introduces Semantic Factor Analysis (SFA), a novel computational approach recovering Big Five personality trait structures from pre-trained adjective word embeddings weighted by empirical participant data. Using Word2Vec embeddings trained on the Google-News-300 corpus, semantic relationships of IPIP-50 Big Five inventory adjectives (Goldberg, 1992) were extracted and factor structures computed through weighted vector averaging and K-means clustering. To validate the methodology, SFA was compared against a baseline using unweighted Word2Vec embeddings. In a controlled experiment with n=55 participants completing standard IPIP-50 assessments, HSP-R scale (Pluess et al., 2024) and multimedia impact surveys, empirically-weighted SFA successfully recovered all five personality dimensions with 62.5{\%} average factor purity, substantially outperforming the unweighted baseline (52.0{\%}, 20{\%} relative improvement), while traditional Confirmatory Factor Analysis showed factor collapse and poor model fit. The approach was validated through Latent Class Analysis deriving empirically-based classification thresholds for Big Five dimensions and supporting a trichotomous Environmental Sensitivity model (Lionetti et al., 2018). Results demonstrate that integrating semantic representations with empirical data improves Big Five structure recovery beyond pure semantic similarity alone, particularly for small sample studies where traditional methods such as CFA will fail due to limited empirical data points.},
}
Markdown (Informal)
[Semantic Factor Analysis: Validating Personality Structure Recovery from empirically-mediated Word Embeddings](https://preview.aclanthology.org/ingest-eacl/2026.latechclfl-1.17/) (Müller, LaTeCH-CLfL 2026)
ACL