% Workshop paper (LAW XX, 2026) on LLM-assisted auditing of annotation errors
% in Hebrew diacritization data.
% NOTE(review): the url field below points at a preview.aclanthology.org
% "ingest" host -- presumably a staging link; confirm the canonical Anthology
% URL (and add a doi if one is assigned) before relying on this entry.
% The address field holds the conference location, as in the original export.
@inproceedings{gershuni-shmidman-2026-human,
  title     = {Human-{AI} Annotation Error Auditing for {Hebrew} Diacritization with Frontier {LLM}s},
  author    = {Gershuni, Hillel and
               Shmidman, Avi},
  editor    = {Liu, Yang Janet and
               Gessler, Luke},
  booktitle = {Proceedings of the 20th Linguistic Annotation Workshop ({LAW} {XX})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.law-main.4/},
  pages     = {33--46},
  isbn      = {979-8-89176-404-0},
  abstract  = {Large annotated datasets inevitably contain errors that are costly to identify via manual review. We study a human-AI annotation error auditing workflow using frontier Large Language Models (LLMs), focusing on Hebrew \textit{nikud} (diacritization). We take the EACL 2023 Hebrew Homograph Challenge Set as our test case. In a focused evaluation on 12 of the homograph sets with 271 confirmed errors (verified through exhaustive manual review of all 7,241 sentences), Gemini 3 Pro achieves 83.6{\%} recall (95{\%} confidence interval: [79.3{\%}, 88.2{\%}]) and 99.1{\%} precision --- substantially higher than other frontier LLMs. Two independent human experts achieved 62.4{\%} and 42.8{\%} recall respectively, a 20-percentage-point spread that reflects the difficulty of sparse-target error search. Even the union of both experts' findings (73.4{\%} recall) falls short of a single LLM run (83.6{\%}), while LLM-aided auditing reduces review effort by over 95{\%}. We analyze the trade-offs between batch size and recall, and release both a human-verified Gold Standard with per-error difficulty annotations and a globally corrected version of the Challenge Set.},
}
Markdown (Informal)
[Human-AI Annotation Error Auditing for Hebrew Diacritization with Frontier LLMs](https://preview.aclanthology.org/ingest-acl-workshops/2026.law-main.4/) (Gershuni & Shmidman, LAW 2026)
ACL