% Conference paper entry for Goyal et al. (2026), as exported from the ACL Anthology preview site.
% NOTE(review): the title reads "Semantic Span Annotation" but the abstract expands SSA as
%   "Structured Span Annotation" -- confirm the correct wording against the paper.
% NOTE(review): the url slug is "acl-srw" while booktitle names the main ACL 2026 proceedings,
%   and the url points at a preview/ingest host -- confirm both before citing.
@inproceedings{goyal-etal-2026-semantic,
  title     = {Semantic Span Annotation: An Exploratory Study of {LLM} Annotation},
  author    = {Goyal, Tejas and
               Krishnan, Dhriti and
               Gupta, Anuj and
               Savelka, Jaromir},
  editor    = {T.Y.S.S., Santosh and
               Rodriguez, Juan Diego and
               de Gibert, Ona},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-srw.39/},
  pages     = {438--449},
  isbn      = {979-8-89176-393-7},
  abstract  = {Structured span extraction research is siloed by context length, annotation task, and domain, making it difficult to assess how well large language models (LLMs) generalize across realistic extraction settings. We introduce SSA (Structured Span Annotation), a unified evaluation framework bringing together five datasets across four domains: finance, biomedicine, affective analysis, and privacy, under a common JSONL format with character-level offsets. We conduct an exploratory study evaluating seven models (three closed, four open-weight) under three prompting configurations: zero-shot, definition-augmented, and few-shot, formulating extraction as inline XML generation where models reproduce the document with tagged spans. Our results reveal two distinct performance regimes: on tasks requiring complex ontology reasoning, zero-shot performance is near zero (e.g., 0.00{\%} F1 on FiNER-139) but improves substantially with label definitions (e.g., Claude Opus 4.6 rises from 8.8{\%} to 57.5{\%} F1); on pattern-based tasks like PII detection, definitions consistently hurt performance across all models. These findings suggest that prompting strategy must be matched to task structure, and that unified evaluation frameworks spanning varied domains and input lengths are essential for understanding LLM extraction capabilities.},
}
Markdown (Informal)
[Semantic Span Annotation: An Exploratory Study of LLM Annotation](https://preview.aclanthology.org/ingest-acl/2026.acl-srw.39/) (Goyal et al., ACL 2026)
ACL
- Tejas Goyal, Dhriti Krishnan, Anuj Gupta, and Jaromir Savelka. 2026. Semantic Span Annotation: An Exploratory Study of LLM Annotation. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026), pages 438–449, San Diego, California, United States. Association for Computational Linguistics.