@inproceedings{xu-etal-2020-unihanlm,
title = "{U}nihan{LM}: Coarse-to-Fine {C}hinese-{J}apanese Language Model Pretraining with the Unihan Database",
author = "Xu, Canwen and
Ge, Tao and
Li, Chenliang and
Wei, Furu",
editor = "Wong, Kam-Fai and
Knight, Kevin and
Wu, Hua",
booktitle = "Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing",
month = dec,
year = "2020",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.aacl-main.24/",
doi = "10.18653/v1/2020.aacl-main.24",
pages = "201--211",
abstract = "Chinese and Japanese share many characters with similar surface morphology. To better utilize the shared knowledge across the languages, we propose UnihanLM, a self-supervised Chinese-Japanese pretrained masked language model (MLM) with a novel two-stage coarse-to-fine training approach. We exploit Unihan, a ready-made database constructed by linguistic experts to first merge morphologically similar characters into clusters. The resulting clusters are used to replace the original characters in sentences for the coarse-grained pretraining of the MLM. Then, we restore the clusters back to the original characters in sentences for the fine-grained pretraining to learn the representation of the specific characters. We conduct extensive experiments on a variety of Chinese and Japanese NLP benchmarks, showing that our proposed UnihanLM is effective on both mono- and cross-lingual Chinese and Japanese tasks, shedding light on a new path to exploit the homology of languages."
}
[UnihanLM: Coarse-to-Fine Chinese-Japanese Language Model Pretraining with the Unihan Database](https://aclanthology.org/2020.aacl-main.24/) (Xu et al., AACL 2020)
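
A minimal, illustrative sketch of the coarse-to-fine character clustering described in the abstract. The cluster table, corpus, and function names below are toy assumptions for illustration only, not the authors' implementation; UnihanLM derives its clusters from the Unihan database and pretrains a transformer masked language model on the resulting sequences.

```python
# Toy sketch of two-stage coarse-to-fine tokenization (NOT the authors' code).
# Assumption: a real setup would build CHAR_TO_CLUSTER from Unihan variant
# fields and feed the resulting token sequences to an MLM pretraining loop.

# Toy Unihan-style mapping: related Chinese/Japanese character variants are
# merged into a single cluster representative.
CHAR_TO_CLUSTER = {
    "国": "CLUSTER_1",  # simplified form
    "國": "CLUSTER_1",  # traditional / kyujitai form
    "学": "CLUSTER_2",
    "學": "CLUSTER_2",
}

def to_coarse(sentence: str) -> list[str]:
    """Stage 1: replace each character by its cluster ID (coarse-grained units)."""
    return [CHAR_TO_CLUSTER.get(ch, ch) for ch in sentence]

def to_fine(sentence: str) -> list[str]:
    """Stage 2: restore the original characters (fine-grained units)."""
    return list(sentence)

corpus = ["中国の大学", "中國的大學"]

# Stage 1: pretrain the MLM on cluster-level sequences shared across languages.
coarse_corpus = [to_coarse(s) for s in corpus]
# Stage 2: continue pretraining on the original character sequences.
fine_corpus = [to_fine(s) for s in corpus]

print(coarse_corpus)  # both sentences now share CLUSTER_1 / CLUSTER_2 tokens
print(fine_corpus)
```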