% Journal article: Computational Linguistics 52(1), March 2026, pp. 271--293.
% The third author's family name is the compound "Torres Cacoullos"; it is
% written in "Last, First" form so BibTeX keeps both words in the surname.
% NOTE(review): url points at a preview.aclanthology.org ingest branch --
% confirm the permanent landing page before release; doi is the stable id.
@article{pattichis-etal-2026-evaluating,
  title     = {Re-evaluating the Word Token for Bilingual Speech Processing: The Case for Intonation Units},
  author    = {Pattichis, Rebecca and
               LaCasse, Dora and
               Torres Cacoullos, Rena},
  journal   = {Computational Linguistics},
  volume    = {52},
  number    = {1},
  month     = mar,
  year      = {2026},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://preview.aclanthology.org/ingest-latest-mitpress-cl-tacl/2026.cl-1.8/},
  doi       = {10.1162/coli.a.580},
  pages     = {271--293},
  abstract  = {Natural Language Processing (NLP) metrics for bilingual code-switching (CS) have, until now, used words as the token level. However, the assumption that any two words constitute an equally likely switch point is erroneous. In spoken language, a major delimiter of CS is a prosodic chunk known as the Intonation Unit (IU). Switch points are far more likely between words at IU boundaries than between words in the same IU. The word as an elementary NLP unit is thus incommensurate with bilingual speech patterns. Here, we put forward an IU-based adaptation of a familiar metric of CS probability. We then compare the token levels on this metric for ten bilingual datasets featuring multi-word CS. Our comparison shows that the currently standard two-significant-figure precision of the word-based metric is insufficient, as the token level compresses the range of values by inflating the universe of CS. More discerning CS probability values can be obtained by normalizing word-based counts using mean IU length.},
}
Markdown (Informal)
[Re-evaluating the Word Token for Bilingual Speech Processing: The Case for Intonation Units](https://preview.aclanthology.org/ingest-latest-mitpress-cl-tacl/2026.cl-1.8/) (Pattichis et al., CL 2026)
ACL