% Rieser (2024), journal article in Dialogue & Discourse, vol. 15, pp. 36--84.
% NOTE(review): url switched from the preview.aclanthology.org branch build to
% the aclanthology.org paper page; confirm it resolves. The doi is the stable identifier.
@article{rieser-2024-multi,
    title = "Multi-modal Anaphora and Broadcasting of Information by Gestural Post-holds",
    author = "Rieser, Hannes",
    editor = "Li, Junyi Jessy  and
      Stede, Manfred  and
      Zeldes, Amir  and
      Ginzburg, Jonathan  and
      Georgila, Kallirroi  and
      Traum, David",
    journal = "Dialogue {\&} Discourse",
    volume = "15",
    month = sep,
    year = "2024",
    address = "Chicago, Illinois, USA",
    publisher = "University of Illinois Chicago",
    url = "https://aclanthology.org/2024.dnd-15.3/",
    doi = "10.5210/dad.2024.202",
    pages = "36--84",
    abstract = "This paper deals with three interrelated topics, linguistic anaphora, multi-modal anaphora and the top-down broadcasting of information using gestural post-holds in multimodal dialogue. Initially, a new solution for definite, pronominal and pro-adverbial anaphora is given based on the idea that an existentially quantified general term may output a definite reference. This approach is extended to multimodal anaphora, where part or all of an anaphor{'}s meaning is contributed by some sequence of iconic or deictic gestures. Anaphora exploit the semantic potential of their antecedents, they work, as tradition has it, ``bottom-up''. An inverse relation, more general than cataphora, and investigated here for the first time, is ``broadcasting'', where information is freely distributed top down and input to receiving sites (ports). Anaphora are modelled with the same top-down mechanism and the same applies for coherence relations in dialogue which generally show an anaphora-like behaviour. ``Broadcasting'' can be used in the context of anaphors, for example, to provide their gestural meaning parts but also for a verb{'}s multi-modal arguments for referring to a location, a direction or an area. As to multi-modal data, broadcasting is shown to be frequently tied up with gestural post-holds, the holding of a gesture{'}s stroke information independently of semantically alignable speech. This leads to considering post-holds from a new perspective, stressing their speech-independent function and their relevance for indicating topic-continuity. We show that multi-modal anaphora and especially broadcasting cross single contributions and turns. The data which let us develop these perspectives come from the SaGA (Speech and Gesture Alignment) corpus, a set of route-description dialogues generated in a VR-setting incorporating marker-based eye-tracking facilities.
The calculus used to model the anaphora and broadcasting dynamics is the concurrent {\ensuremath{\lambda}}{\ensuremath{\Psi}}-calculus, a recently developed two-tiered machinery using a {\ensuremath{\Psi}}-calculus for input-output, data transport and broadcasting. The data transported are in a typed {\ensuremath{\lambda}}-calculus format incorporating Neo-Davidsonian representations; these data can be linguistic, gestural only or multi-modal. Multi-modal informational chunks are modelled as communicating agents sending and receiving information via input-output-channels. They are introduced incrementally on an empirically motivated construction or gesture-plus-construction or gesture only basis. The {\ensuremath{\lambda}}{\ensuremath{\Psi}}-calculus is also used for the multi-modal fusion component unifying gestural and linguistic information; hence, the paper is also a contribution to multi-modal fusion of linguistic and gestural input. Finally, it is shown how the presented algorithm can capture multi-modal coherence relations or a multi-modal anaphora resolution based on PTT ideas."
}