@inproceedings{dai-etal-2025-next,
title = "Next-Level {C}antonese-to-{M}andarin Translation: Fine-Tuning and Post-Processing with {LLM}s",
author = "Dai, Yuqian and
Chan, Chun Fai and
Wong, Ying Ki and
Pun, Tsz Ho",
editor = "Hettiarachchi, Hansi and
Ranasinghe, Tharindu and
Rayson, Paul and
Mitkov, Ruslan and
Gaber, Mohamed and
Premasiri, Damith and
Tan, Fiona Anting and
Uyangodage, Lasitha",
booktitle = "Proceedings of the First Workshop on Language Models for Low-Resource Languages",
month = jan,
year = "2025",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.loreslm-1.32/",
pages = "427--436",
abstract = "Large Language Models (LLMs) have improved performance across various natural language processing tasks. Despite these improvements, LLMs continue to face significant challenges, such as grammatical issues and code-switching to English, when applied to low-resource languages like Cantonese in Machine Translation (MT) scenarios. By addressing the unique linguistic and contextual challenges of Cantonese, we present a novel strategy to improve the understanding and translation capabilities of LLMs for Cantonese-to-Mandarin MT. Our strategy comprises three key components: (1) Syntax and Part-of-Speech (POS) fine-tuning, where we use the Universal Dependencies (UD) corpus to fine-tune LLM, focusing on the linguistic structures of Cantonese; (2) Specialized Cantonese to Mandarin sentence pairs, collected from diverse sources such as Cantonese grammar textbooks and manually translated sentences across various domains, to expose the model to a wide range of linguistic contexts; (3) Post-processing with additional LLMs, where we introduce additional LLMs to improve the initial translations, correcting Mandarin grammar and punctuation. Empirical evaluations on human-created test sets show that our proposed strategy improves translation performance and outperforms existing commercial translation models with at least 3 BLEU scores. Additionally, our strategy also benefits other LLMs and a reversed translation direction, demonstrating its generalization and effectiveness."
}
[Next-Level Cantonese-to-Mandarin Translation: Fine-Tuning and Post-Processing with LLMs](https://preview.aclanthology.org/fix-sig-urls/2025.loreslm-1.32/) (Dai et al., LoResLM 2025)