% NOTE(review): the url field below points at an ACL Anthology preview/staging build (preview.aclanthology.org/ingest-...); presumably it should become the canonical aclanthology.org URL once the volume is published -- confirm before citing.
@inproceedings{goodman-2026-inferring,
    title = "Inferring Student Engagement via Real-Time Thermal{--}Visual Voice Activity Detection",
    author = "Goodman, Bradley",
    editor = "Kochmar, Ekaterina and
      Alhafni, Bashar and
      Bann{\`o}, Stefano and
      Bexte, Marie and
      Burstein, Jill and
      Horbach, Andrea and
      Laarmann-Quante, Ronja and
      Tack, Anais and
      Yaneva, Victoria and
      Yuan, Zheng",
    booktitle = "Proceedings of the 21st Workshop on Innovative Use of {NLP} for Building Educational Applications ({BEA} 2026)",
    month = jul,
    year = "2026",
    address = "San Diego, California, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.bea-1.2/",
    pages = "7--20",
    isbn = "979-8-89176-409-5",
    abstract = "We introduce a thermal{--}visual fusion approach to improve non-invasive Voice Activity Detection (VAD) for classroom engagement monitoring. In noisy multi-speaker classrooms using a single microphone, acoustic-only methods fail to reliably isolate individual speakers. Our method integrates facial thermal signatures{---}capturing respiratory and speech-related heat patterns{---}with visual lip-motion cues to provide an acoustic-independent speech signal. This provides a localized, privacy-preserving, and acoustic-independent indicator of speech activity. This system acts as a visual-diarization frontend, informing Automatic Speech Recognition (ASR) and Natural Language Processing (NLP) systems not only when speech occurs, but precisely which student is speaking. Using up to 19 engineered features, our Thermal-Only Random Forest classifier achieved a Recall of 0.9234 and an F1-score of 0.8105 in subject-independent evaluations, outperforming visual-only baselines. The system was validated as a proof-of-concept on a Raspberry Pi 5 in a controlled laboratory setting, demonstrating real-time feasibility. These results demonstrate that thermal{--}visual fusion enables more reliable linguistic analysis of collaborative learning and provide critical input for AI agents to facilitate group participation in real-world educational settings that lead to more successful learning outcomes."
}
Markdown (Informal)
[Inferring Student Engagement via Real-Time Thermal–Visual Voice Activity Detection](https://preview.aclanthology.org/ingest-acl-workshops/2026.bea-1.2/) (Goodman, BEA 2026)
ACL