@comment{
  Conference-paper entry for Galyukshev and Alimova (2026).
  NOTE(review): the url field points at a preview/ingest host; confirm the
  canonical address before relying on it in a published bibliography.
}
@inproceedings{galyukshev-alimova-2026-processing,
  title     = {Processing Inconsistency Predicts Language Competence: {LLM} Evaluation Without Answer Labels on {Turkic} Languages},
  author    = {Galyukshev, Ilya and
               Alimova, Ilseyar},
  editor    = {T.Y.S.S., Santosh and
               Rodriguez, Juan Diego and
               de Gibert, Ona},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-srw.94/},
  pages     = {1074--1086},
  isbn      = {979-8-89176-393-7},
  abstract  = {Most languages lack labeled evaluation benchmarks for large language models (LLMs). Creating such benchmarks requires native speakers, domain expertise, and answer annotation{---}resources unavailable for the vast majority of languages. We investigate whether a model{'}s internal processing signals{---}such as generation entropy and tokenizer statistics{---}correlate with its actual accuracy on a language, with the long-term goal of estimating language competence without labeled data. Our key observation is that for languages a model does not know, both tokenizer segmentation and generation entropy become highly variable across questions, whereas for known languages they remain consistent. We call this the \emph{inconsistency hypothesis} and test it on 11 instruction-tuned LLMs (1B{--}70B parameters) across 14 language{--}script varieties (12 Turkic plus English and Russian controls). We extract over 25 processing features per model{--}language pair; individually, even the strongest correlate only moderately with accuracy (Pearson $|r|$ up to 0.55). Yet combining just three complementary features{---}a tokenizer coverage ratio, entropy variability, and the model{'}s English/Russian benchmark score{---}explains 75{\%} of accuracy variance in leave-one-language-out evaluation, nearly doubling the 44{\%} explained by a model-mean baseline. The variability of processing signals (standard deviation) consistently outperforms mean values as a predictor across all five model families, but only for greedy-pass measures; sampling-based measures show no such pattern.},
}
Markdown (Informal)
[Processing Inconsistency Predicts Language Competence: LLM Evaluation Without Answer Labels on Turkic Languages](https://preview.aclanthology.org/ingest-acl/2026.acl-srw.94/) (Galyukshev & Alimova, ACL 2026)
ACL