% Conference paper entry (ACL 2026, Volume 2: Short Papers).
% NOTE(review): the url field points at a preview "ingest-acl" host; confirm the
% canonical permanent URL (and add a DOI if one exists) before relying on it.
@inproceedings{kotoge-sasaki-2026-data,
  title     = {Data-efficient Targeted Token-level Preference Optimization for {LLM}-based Text-to-Speech},
  author    = {Kotoge, Rikuto and Sasaki, Yuichi},
  editor    = {Liakata, Maria and Moreira, Viviane P. and Zhang, Jiajun and Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 2: Short Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-short.59/},
  pages     = {719--726},
  isbn      = {979-8-89176-391-3},
  abstract  = {Aligning text-to-speech (TTS) system outputs with human feedback through preference optimization has been shown to effectively improve the robustness and naturalness of LLM-based TTS models. Current approaches primarily require paired desirable and undesirable samples at the utterance level. However, such pairs are often limited in TTS output data, and utterance-level formulation prevents fine-grained token-level optimization needed for accurate pronunciation alignment. In this study, we propose TKTO that eliminates the need for paired data, enabling a more data-efficient training paradigm, and directly targets token-level units, automatically providing fine-grained alignment signals without token-level annotations. TKTO improves the challenging Japanese TTS accuracy by 39{\%} and reduces CER by 54{\%}, leveraging 6{\texttimes} more training data and assigning 12.8{\texttimes} stronger reward to targeted tokens.},
}
Markdown (Informal)
[Data-efficient Targeted Token-level Preference Optimization for LLM-based Text-to-Speech](https://preview.aclanthology.org/ingest-acl/2026.acl-short.59/) (Kotoge & Sasaki, ACL 2026)
ACL