% Conference paper from the *SEM 2025 proceedings (ACL Anthology ID 2025.starsem-1.34).
% The url field uses the canonical Anthology address rather than a temporary
% preview/ingest branch, which is not a stable citation target.
@inproceedings{kalbaliyev-sirts-2025-towards,
  title     = {Towards Evaluation of Language Models with Skill Dimensions: A Case Study on Narrative Question Answering},
  author    = {Kalbaliyev, Emil and
               Sirts, Kairit},
  editor    = {Frermann, Lea and
               Stevenson, Mark},
  booktitle = {Proceedings of the 14th Joint Conference on Lexical and Computational Semantics ({*SEM} 2025)},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.starsem-1.34/},
  pages     = {430--440},
  isbn      = {979-8-89176-340-1},
  abstract  = {Large language models have demonstrated varying levels of competence across a range of reasoning tasks, but coarse-grained evaluations often do not reflect their specific strengths and weaknesses, particularly in complex tasks such as Narrative Question Answering. In this paper, we advocate for a multi-dimensional skill-based evaluation that assesses models across distinct core skill dimensions. Our proposed skill-focused evaluation framework offers a granular and more realistic measure of model performance, revealing targeted areas for improvement and guiding future development. Experiments on Narrative Question Answering demonstrate that dimension-level analysis captures the multifaceted nature of the task and informs more effective model evaluation.},
}

Markdown (Informal)
[Towards Evaluation of Language Models with Skill Dimensions: A Case Study on Narrative Question Answering](https://preview.aclanthology.org/ingest-emnlp/2025.starsem-1.34/) (Kalbaliyev & Sirts, *SEM 2025)
ACL