@inproceedings{ji-etal-2024-robust,
title = "Robust Guidance for Unsupervised Data Selection: Capturing Perplexing Named Entities for Domain-Specific Machine Translation",
author = "Ji, Seunghyun and
Sinulingga, Hagai Raja and
Kwon, Darongsae",
editor = "Melero, Maite and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.sigul-1.37/",
pages = "307--317",
abstract = "Low-resourced data presents a significant challenge for neural machine translation. In most cases, the low-resourced environment is caused by high costs due to the need for domain experts or the lack of language experts. Therefore, identifying the most training-efficient data within an unsupervised setting emerges as a practical strategy. Recent research suggests that such effective data can be identified by selecting `appropriately complex data' based on its volume, providing strong intuition for unsupervised data selection. However, we have discovered that establishing criteria for unsupervised data selection remains a challenge, as the `appropriate level of difficulty' may vary depending on the data domain. We introduce a novel unsupervised data selection method named `Capturing Perplexing Named Entities,' which leverages the maximum inference entropy in translated named entities as a metric for selection. When tested with the `Korean-English Parallel Corpus of Specialized Domains,' our method served as robust guidance for identifying training-efficient data across different domains, in contrast to existing methods."
}
Markdown (Informal)
[Robust Guidance for Unsupervised Data Selection: Capturing Perplexing Named Entities for Domain-Specific Machine Translation](https://preview.aclanthology.org/fix-sig-urls/2024.sigul-1.37/) (Ji et al., SIGUL 2024)
ACL