% Conference paper entry (ACL 2026, Volume 1: Long Papers).
% NOTE(review): the url field points at the Anthology preview/ingest host;
% confirm whether it should be swapped for the canonical URL once published.
% NOTE(review): address holds the conference venue, not a publisher city;
% this looks like the usual ACL Anthology export convention -- confirm.
@inproceedings{liu-etal-2026-llms,
  title     = {Would {LLMs} be Good Historical Linguists and {Chinese} Dialect Learners?},
  author    = {Liu, Yicheng and
               Shi, Shumin and
               Zhou, Youchao and
               Zhang, Xingchen},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association for Computational Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.831/},
  pages     = {18231--18256},
  isbn      = {979-8-89176-390-6},
  abstract  = {Large language models (LLMs) perform well on Standard Chinese but struggle with low-resource Chinese dialects due to substantial phonological divergence. We investigate whether incorporating Middle Chinese, the common historical ancestor of most of the modern Chinese dialects, can improve dialectal pronunciation modeling in a linguistically interpretable manner. We focus on two specific task variants: (1) conditional sound change rule induction (a variant of Sound Law Induction, SLI), where models infer executable phonological transformation rules from Middle Chinese to modern dialects, and (2) sentence-level dialectal pronunciation transcription (a variant of Grapheme-to-Phoneme, G2P), requiring dialect-specific International Phonetic Alphabet (IPA) generation. We construct a multi-source dataset covering Middle Chinese and 12 modern Chinese dialects, including character-level correspondences, rule exemplars, and sentence-level IPA transcription. We adopt a parameter-efficient training framework combining LoRA-based supervised fine-tuning and reinforcement learning via Group Relative Policy Optimization (GRPO) for the first task. Across both tasks and a wide range of dialects and evaluation metrics, our approach achieves overall improvements over strong baselines, including DeepSeek-V3.2 and ChatGPT-5.2, while revealing variation across dialects. These results demonstrate the value of leveraging historical linguistic knowledge for modeling low-resource Chinese dialects.},
}

Markdown (Informal)
[Would LLMs be Good Historical Linguists and Chinese Dialect Learners?](https://preview.aclanthology.org/ingest-acl/2026.acl-long.831/) (Liu et al., ACL 2026)
ACL
- Yicheng Liu, Shumin Shi, Youchao Zhou, and Xingchen Zhang. 2026. Would LLMs be Good Historical Linguists and Chinese Dialect Learners? In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 18231–18256, San Diego, California, United States. Association for Computational Linguistics.