@inproceedings{gobara-etal-2025-speaker,
  author    = {Gobara, Seiji and
               Kamigaito, Hidetaka and
               Watanabe, Taro},
  title     = {Speaker Identification and Dataset Construction Using {LLMs}: A Case Study on {Japanese} Narratives},
  editor    = {Clark, Elizabeth and
               Lal, Yash Kumar and
               Chaturvedi, Snigdha and
               Iyyer, Mohit and
               Brei, Anneliese and
               Modi, Ashutosh and
               Chandu, Khyathi Raghavi},
  booktitle = {Proceedings of the 7th Workshop on Narrative Understanding},
  month     = may,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.wnu-1.17/},
  pages     = {97--119},
  isbn      = {979-8-89176-247-3},
  abstract  = {Speaker identification in narrative analysis is a challenging task due to complex dialogues, diverse utterance patterns, and ambiguous character references. Costly and time-intensive manual annotation limits the scalability of high-quality dataset creation. This study demonstrates a cost-efficient approach of constructing speaker identification datasets by combining small-scale manual annotation with LLM-based labeling. A subset of data is manually annotated and is used to guide LLM predictions with a few-shot approach followed by refinement through minimal human corrections. Our results show that LLMs achieve approximately 90{\%} accuracy on challenging narratives, such as the ``Three Kingdoms'' dataset, underscoring the importance of targeted human corrections. This approach proves effective for constructing scalable and cost-efficient datasets for Japanese and complex narratives.},
}
Markdown (Informal)
[Speaker Identification and Dataset Construction Using LLMs: A Case Study on Japanese Narratives](https://preview.aclanthology.org/fix-sig-urls/2025.wnu-1.17/) (Gobara et al., WNU 2025)
ACL