% Conference paper in the ACL 2026 proceedings (Volume 1: Long Papers).
% The word Flexible is brace-protected in the title so that styles which
% sentence-case titles keep its capital after "Splits!".
% Emphasis in the abstract uses LaTeX markup (\textit / \textbf) so it does
% not come out as literal asterisks when the field is typeset.
% NOTE(review): the url looks like a preview (pre-publication) location;
% confirm the final published link before relying on it.
@inproceedings{caplan-etal-2026-splits,
  title     = {Splits! {Flexible} Sociocultural Linguistic Investigation at Scale},
  author    = {Caplan, Eylon and
               Chakraborty, Tania and
               Goldwasser, Dan},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.117/},
  pages     = {2526--2550},
  isbn      = {979-8-89176-390-6},
  abstract  = {Variation in language use, shaped by speakers' sociocultural background and specific context of use, offers a rich lens into cultural perspectives, values, and opinions. For example, Chinese students discuss \textit{healthy eating} with words like \textit{timing}, \textit{regularity}, and \textit{digestion}, whereas Americans use vocabulary like \textit{balancing food groups} and \textit{avoiding fat and sugar}, reflecting distinct cultural models of nutrition (Banna et al., 2016). The computational study of these Sociocultural Linguistic Phenomena (SLP) has traditionally been done in NLP via tailored analyses of specific groups or topics, requiring specialized data collection and experimental operationalization{---}a process not well-suited to quick hypothesis exploration and prototyping. To address this, we propose constructing a ``sandbox'' designed for systematic and flexible sociolinguistic research. Using our method, we construct a demographically/topically split Reddit dataset, \textbf{Splits!}, validated by self-identification and by replicating several known SLPs from existing literature. We showcase the sandbox{'}s utility with a scalable, two-stage process that filters large collections of \textit{potential} SLPs (PSLPs) to surface the most promising candidates for deeper, qualitative investigation.},
}
Markdown (Informal)
[Splits! Flexible Sociocultural Linguistic Investigation at Scale](https://preview.aclanthology.org/ingest-acl/2026.acl-long.117/) (Caplan et al., ACL 2026)
ACL
- Eylon Caplan, Tania Chakraborty, and Dan Goldwasser. 2026. Splits! Flexible Sociocultural Linguistic Investigation at Scale. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 2526–2550, San Diego, California, United States. Association for Computational Linguistics.