% Workshop paper from the GEM 2025 proceedings (ACL Anthology ID 2025.gem-1.46).
% The abstract text has been cleaned of PDF-extraction artefacts (split and fused
% words, mismatched quotes); the url uses the permanent Anthology address.
@inproceedings{bunn-etal-2025-fine,
    title = "Fine-Tune on the Format: First Improving Multiple-Choice Evaluation for Intermediate {LLM} Checkpoints",
    author = "Bunn, Alec and
      Wiegreffe, Sarah and
      Bogin, Ben",
    editor = "Dhole, Kaustubh and
      Clinciu, Miruna",
    booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
    month = jul,
    year = "2025",
    address = "Vienna, Austria and virtual meeting",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.gem-1.46/",
    pages = "511--521",
    isbn = "979-8-89176-261-9",
    abstract = "Evaluation of intermediate language model checkpoints during training is critical for effective model development and selection. However, reliable evaluation using the popular multiple-choice question (MCQ) format is challenging, as small and non instruction-tuned models often lack the symbolic reasoning required for the task. This is despite the fact that MCQ evaluation is often used and needed to distinguish between the performance of different training runs. In particular, when prompted with a question and a set of labeled answer choices (e.g., ``A. . . . , B. . . . , C. . . . ''), many models struggle to emit the correct label (e.g., ``C''), even when they can select the correct string answer choice. We propose an alternative evaluation method: fine-tuning the model on an auxiliary MCQ dataset prior to outputting labels. We validate this approach empirically by showing that training on auxiliary data improves MCQ ability on all our test datasets except 1. This approach provides a more accurate signal of model capability at intermediate checkpoints, as it disentangles the evaluation of core knowledge from the model{'}s emerging ability to follow formatting instructions."
}
Markdown (Informal)
[Fine-Tune on the Format: First Improving Multiple-Choice Evaluation for Intermediate LLM Checkpoints](https://aclanthology.org/2025.gem-1.46/) (Bunn et al., GEM 2025)
ACL