@inproceedings{liu-etal-2024-translico,
title = "{T}ransli{C}o: A Contrastive Learning Framework to Address the Script Barrier in Multilingual Pretrained Language Models",
author = "Liu, Yihong and
Ma, Chunlan and
Ye, Haotian and
Sch{\"u}tze, Hinrich"
editor = "Ku, Lun-Wei and
Martins, Andr{\'e} and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.acl-long.136/",
doi = "10.18653/v1/2024.acl-long.136",
pages = "2476--2499",
abstract = "The world{'}s more than 7000 languages are written in at least 293 scripts. Due to various reasons, many closely related languages use different scripts, which poses a difficulty for multilingual pretrained language models (mPLMs) in learning crosslingual knowledge through lexical overlap. As a consequence, mPLMs are faced with a script barrier: representations from different scripts are located in different subspaces, which can result in crosslingual transfer involving languages of different scripts performing suboptimally. To address this problem, we propose TransliCo, a framework that optimizes the Transliteration Contrastive Modeling (TCM) objective to fine-tune an mPLM by contrasting sentences in its training data and their transliterations in a unified script (in our case Latin), which enhances uniformity in the representation space for different scripts. Using Glot500-m, an mPLM pretrained on over 500 languages, as our source model, we fine-tune it on a small portion (5{\%}) of its training data, and refer to the resulting model as Furina. We show that Furina not only better aligns representations from distinct scripts but also outperforms the original Glot500-m on various zero-shot crosslingual transfer tasks. Additionally, we achieve consistent improvement in a case study on the Indic group where the languages exhibit areal features but use different scripts. We make our code and models publicly available."
}
Markdown (Informal)
[TransliCo: A Contrastive Learning Framework to Address the Script Barrier in Multilingual Pretrained Language Models](https://aclanthology.org/2024.acl-long.136/) (Liu et al., ACL 2024)
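For readers who want the gist of the Transliteration Contrastive Modeling (TCM) objective in code: below is a minimal sketch, assuming a standard SimCSE-style InfoNCE loss with in-batch negatives over pooled sentence embeddings. The function name, temperature, and symmetric two-direction formulation are illustrative assumptions, not the authors' released implementation, and the paper's full training recipe may differ.

```python
# Minimal sketch of a TCM-style contrastive loss (illustrative, not the
# paper's released code). Each original-script sentence is pulled toward
# its Latin transliteration; other transliterations in the batch serve
# as negatives. Embeddings would come from an mPLM such as Glot500-m.
import torch
import torch.nn.functional as F

def tcm_loss(orig_emb: torch.Tensor, translit_emb: torch.Tensor,
             temperature: float = 0.05) -> torch.Tensor:
    """InfoNCE between sentences and their Latin transliterations.

    orig_emb, translit_emb: (batch, dim) sentence embeddings of the
    original-script sentences and their transliterations.
    """
    orig = F.normalize(orig_emb, dim=-1)
    translit = F.normalize(translit_emb, dim=-1)
    # (batch, batch) cosine similarities; diagonal entries are positives.
    logits = orig @ translit.T / temperature
    targets = torch.arange(logits.size(0), device=logits.device)
    # Symmetric loss: original -> transliteration and the reverse.
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.T, targets))

if __name__ == "__main__":
    # Toy stand-ins for pooled mPLM outputs of a batch of 8 sentence pairs.
    x = torch.randn(8, 768)             # original-script embeddings
    y = x + 0.1 * torch.randn(8, 768)   # transliteration embeddings
    print(tcm_loss(x, y).item())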