@inproceedings{turker-eryigit-2026-instruction,
    title = "Instruction-Following {LLM}s for Grammatical Error Correction: Analyzing Neutral-Anchored Instructional Sensitivity Across Editing Modes",
    author = {T{\"u}rker, Tolgahan and
      Eryi{\u{g}}it, G{\"u}l{\c{s}}en},
    editor = "Kochmar, Ekaterina and
      Alhafni, Bashar and
      Bann{\`o}, Stefano and
      Bexte, Marie and
      Burstein, Jill and
      Horbach, Andrea and
      Laarmann-Quante, Ronja and
      Tack, Anais and
      Yaneva, Victoria and
      Yuan, Zheng",
    booktitle = "Proceedings of the 21st Workshop on Innovative Use of {NLP} for Building Educational Applications ({BEA} 2026)",
    month = jul,
    year = "2026",
    address = "San Diego, California, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.bea-1.17/",
    pages = "234--247",
    isbn = "979-8-89176-409-5",
    abstract = "Grammatical Error Correction (GEC) requires models to make edit decisions under competing objectives: correcting errors while either minimizing changes or maximizing fluency. However, we lack a principled characterization of how instruction-following Large Language Models (LLMs) shift their edit decisions across such editing modes, and whether standard evaluation setups faithfully reflect these shifts. We address this gap by defining three modes{---}Neutral, Minimal-Edit, and Fluency-Edit{---}and measuring neutral-anchored performance shifts to quantify instructional sensitivity. We benchmark seven LLMs, including proprietary and open-weight models, in a unified zero-shot prompting schema on CoNLL-2014, BEA-2019, and JFLEG datasets. The Minimal-Edit instruction mitigates over-editing and typically boosts precision; in some settings, strong models also improve recall, suggesting more selective and effective corrections. In contrast, the Fluency-Edit instruction often encourages broader paraphrastic rewriting that may improve perceived fluency while lowering GLEU, suggesting both a metric-objective mismatch and a shift away from targeted local correction. Notably, Claude-Sonnet-4.5 demonstrates superior zero-shot capabilities, outperforming previously reported scores and matching or even exceeding few-shot results across CoNLL-2014 ($F_{0.5}$: 67.05), BEA-2019 ($F_{0.5}$: 64.91), and JFLEG (GLEU: 66.09)."
}