@inproceedings{fatima-2025-proactive,
title = "A Proactive Reliability Metric for Detecting Failures in Language Model Training",
author = "Fatima, Maryam",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
    address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/dashboard/2025.emnlp-industry.193/",
pages = "2897--2913",
ISBN = "979-8-89176-333-3",
abstract = "Training large language models (LLMs) at scale is fraught with instabilities that can lead to catastrophic failures, wasting millions of dollars in compute resources. Current approaches rely on reactive interventions like checkpointing, which only mitigate failures after detection. We introduce the R-Metric, a proactive reliability metric that combines signals from hardware monitoring ($\lambda$), training dynamics ($\sigma^2$), and model performance ($\Delta L$) to predict failures before they occur. Through extensive experiments across 720 simulated runs and real-world validation on diverse hardware (NVIDIA T4/L4 GPUs) and model architectures (Llama 3.2-1B, GPT-2 Large, Qwen3-0.6B, Liquid AI LFM2-700M), we demonstrate that the R-Metric achieves 0.973 F1-Score in simulation and perfect 1.00 F1-Score in real-world deployment with an average lead time of 255 steps (12.8 minutes for small models, scaling to 2-8 minutes at production training speeds), enabling preemptive intervention. Importantly, our optimized weights ($\lambda$=0.10, $\sigma^2$=0.45, $\Delta L$=0.70) transfer across architectures with less than 3{\%} performance degradation, eliminating expensive retuning. The metric{'}s lightweight computational overhead (1.8{\%} training time increase) makes it immediately deployable for resource-constrained organizations{---}academic labs, startups, and open-source communities{---}democratizing access to enterprise-grade reliability monitoring."
}
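The abstract names three signals (hardware monitoring $\lambda$, training dynamics $\sigma^2$, model performance $\Delta L$) and their optimized weights (0.10, 0.45, 0.70), but not the exact combination rule. The sketch below is a hypothetical illustration of one plausible reading, a linear weighted combination of normalized risk signals; the normalization, the aggregation rule, and the alerting threshold are all assumptions, not the authors' method.

```python
# Hypothetical sketch of a weighted reliability score in the spirit of the
# R-Metric. Only the signal names and weights below come from the abstract;
# everything else (normalization to [0, 1], linear aggregation, threshold)
# is an illustrative assumption.

W_HW, W_VAR, W_LOSS = 0.10, 0.45, 0.70  # lambda, sigma^2, Delta-L weights from the abstract


def r_metric(hw_failure_rate: float, grad_variance: float, loss_delta: float) -> float:
    """Combine three risk signals into a single score.

    Each argument is assumed to be pre-normalized to [0, 1], with higher
    values indicating higher failure risk. The linear combination is a
    guess; the paper may aggregate the signals differently.
    """
    return W_HW * hw_failure_rate + W_VAR * grad_variance + W_LOSS * loss_delta


if __name__ == "__main__":
    # Example: low hardware risk, but unstable gradients and a worsening loss.
    score = r_metric(hw_failure_rate=0.05, grad_variance=0.6, loss_delta=0.8)
    print(f"R-Metric score: {score:.3f}")  # alert when the score crosses a tuned threshold
```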