@inproceedings{rice-etal-2025-egodrive,
title = "{E}go{D}rive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria",
author = "Rice, Michael and
Krause, Lorenz and
Qureshi, Waqar Shahid",
editor = "Acarturk, Cengiz and
Nasir, Jamal and
Can, Burcu and
{\c{C}}{\"o}ltekin, {\c{C}}a{\u{g}}r{\i}",
booktitle = "Proceedings of the First International Workshop on Gaze Data and Natural Language Processing",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, BULGARIA",
url = "https://preview.aclanthology.org/corrections-2026-01/2025.gaze4nlp-1.3/",
pages = "18--25",
abstract = "Egocentric sensing using wearable devices of- fers a unique first-person perspective for driver behavior analysis and monitoring, with the po- tential to accurately capture rich multimodal cues such as eye gaze, head motion, and hand activity directly from the driver{'}s view- point. In this paper, we introduce a multimodal driver behavior recognition framework utilizing Meta{'}s Project Aria smart glasses, along with a novel, synchronized egocentric driving dataset comprising high-resolution RGB video, gaze- tracking data, inertial IMU signals, hand pose landmarks, and YOLO-based semantic object detections. All sensor data streams are tempo- rally aligned and segmented into fixed-length clips, each manually annotated with one of six distinct driver behavior classes: Driving, Left Mirror Check, Right Wing Mirror Check, Rear- view Mirror Check, Mobile Phone Usage, and Idle. We design a Transformer-based recog- nition framework in which each modality is processed by a specialized encoder and then fused via Temporal Transformer layers to cap- ture cross-modal temporal dependencies. To in- vestigate the trade-off between accuracy and ef- ficiency for real-time deployment, we introduce two model variants: EgoDriveMax, optimized for maximum accuracy, and EgoDriveRT, de- signed for real-time performance. These mod- els achieve classification accuracies of 98.6{\%} and 97.4{\%} respectively. Notably, EgoDriveRT delivers strong performance despite operating with only 104K parameters and requiring just 2.65 ms per inference without the use of a spe- cialized GPU{---}highlighting its potential for efficient, real-time in-cabin driver monitoring."
}