@inproceedings{rice-etal-2025-egodrive,
    title     = "{EgoDrive}: Egocentric Multimodal Driver Behavior Recognition Using {Project Aria}",
    author    = "Rice, Michael and
                 Krause, Lorenz and
                 Qureshi, Waqar Shahid",
    editor    = "Acart{\"u}rk, Cengiz and
                 Nasir, Jamal and
                 Can, Burcu and
                 {\c{C}}{\"o}ltekin, {\c{C}}a{\u{g}}r{\i}",
    booktitle = "Proceedings of the First International Workshop on Gaze Data and Natural Language Processing",
    month     = sep,
    year      = "2025",
    address   = "Varna, Bulgaria",
    publisher = "INCOMA Ltd., Shoumen, BULGARIA",
    url       = "https://aclanthology.org/2025.gaze4nlp-1.3/",
    pages     = "18--25",
    abstract  = "Egocentric sensing using wearable devices offers a unique first-person perspective for driver behaviour analysis and monitoring, with the potential to accurately capture rich multimodal cues such as eye gaze, head motion, and hand activity directly from the driver{'}s viewpoint. In this paper, we introduce a multimodal driver behaviour recognition framework utilizing Meta{'}s Project Aria smart glasses, along with a novel, synchronized egocentric driving dataset comprising high-resolution Red Green Blue (RGB) video, gaze-tracking data, Inertial Measurement Unit (IMU) signals, hand pose landmarks, and YOLO-based semantic object detections. All sensor data streams are temporally aligned and segmented into fixed-length clips, each manually annotated with one of six distinct driver behavior classes: \textit{Driving}, \textit{Left Mirror Check}, \textit{Right Wing Mirror Check}, \textit{Rear-view Mirror Check}, \textit{Mobile Phone Usage}, and \textit{Idle}. We design a Transformer-based recognition framework in which each modality is processed by a specialized encoder and then fused via Temporal Transformer layers to capture cross-modal temporal dependencies. To investigate the trade-off between accuracy and efficiency for real-time deployment, we introduce two model variants: EgoDriveMax, optimized for maximum accuracy, and EgoDriveRT, designed for real-time performance. These models achieve classification accuracies of 98.6{\%} and 97.4{\%} respectively. Notably, EgoDriveRT delivers strong performance despite operating with only 104K parameters and requiring just 2.65 ms per inference without the use of a specialized GPU{---}highlighting its potential for efficient, real-time in-cabin driver monitoring."
}

@comment{Informal markdown citation (preserved from original paste; not parsed by BibTeX):
[EgoDrive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria](https://aclanthology.org/2025.gaze4nlp-1.3/) (Rice et al., Gaze4NLP 2025)
ACL}