@inproceedings{liu-etal-2025-transmi,
title = "{T}rans{MI}: A Framework to Create Strong Baselines from Multilingual Pretrained Language Models for Transliterated Data",
author = {Liu, Yihong and
Ma, Chunlan and
Ye, Haotian and
Sch{\"u}tze, Hinrich},
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.32/",
pages = "469--495",
abstract = "Transliterating related languages that use different scripts into a common script is effective for improving crosslingual transfer in downstream tasks. However, this methodology often makes pretraining a model from scratch unavoidable, as transliteration brings about new subwords not covered in existing multilingual pretrained language models (mPLMs). This is undesirable because it requires a large computation budget. A more promising way is to make full use of available mPLMs. To this end, this paper proposes a simple but effective framework: Transliterate-Merge-Initialize (TransMI). TransMI can create strong baselines for data that is transliterated into a common script by exploiting an existing mPLM and its tokenizer without any training. TransMI has three stages: (a) transliterate the vocabulary of an mPLM into a common script; (b) merge the new vocabulary with the original vocabulary; and (c) initialize the embeddings of the new subwords. We apply TransMI to three strong recent mPLMs. Our experiments demonstrate that TransMI not only preserves the mPLM's ability to handle non-transliterated data, but also enables it to effectively process transliterated data, thereby facilitating crosslingual transfer across scripts. The results show consistent improvements of 3{\%} to 34{\%} for different mPLMs and tasks. We make our code and models publicly available at \url{https://github.com/cisnlp/TransMI}."
}
Markdown (Informal)
[TransMI: A Framework to Create Strong Baselines from Multilingual Pretrained Language Models for Transliterated Data](https://aclanthology.org/2025.coling-main.32/) (Liu et al., COLING 2025)
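
For orientation, here is a minimal sketch of the three TransMI stages the abstract describes, built on the Hugging Face `transformers` API. The model name, the toy `transliterate` placeholder, and the copy-from-source embedding initialization are illustrative assumptions, not the paper's exact procedure; see \url{https://github.com/cisnlp/TransMI} for the authors' implementation.

```python
# Sketch of Transliterate-Merge-Initialize (TransMI), per the abstract:
# (a) transliterate the mPLM vocabulary, (b) merge new subwords into the
# vocabulary, (c) initialize embeddings of the new subwords.
from transformers import AutoModel, AutoTokenizer

def transliterate(subword: str) -> str:
    """Hypothetical stand-in for a common-script transliterator
    (a real pipeline would romanize non-Latin scripts)."""
    return subword.lower()

model_name = "xlm-roberta-base"  # an assumed example mPLM
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# (a) Transliterate the existing vocabulary into a common script,
# remembering which original subword each transliteration came from.
vocab = tokenizer.get_vocab()  # subword -> id
transliterated = {transliterate(tok): tok for tok in vocab}

# (b) Merge: add only those transliterated subwords that the
# original vocabulary does not already contain.
new_subwords = [t for t in transliterated if t not in vocab]
tokenizer.add_tokens(new_subwords)

# (c) Initialize each new subword's embedding; copying the embedding of
# its source subword is one plausible choice for this sketch.
old_embeddings = model.get_input_embeddings().weight.data.clone()
model.resize_token_embeddings(len(tokenizer))
embeddings = model.get_input_embeddings().weight.data
for t in new_subwords:
    src_id = vocab[transliterated[t]]
    new_id = tokenizer.convert_tokens_to_ids(t)
    embeddings[new_id] = old_embeddings[src_id]
```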