% ACL 2026 long paper; the paper ID (2026.acl-long.1696) is the last path segment of the url field.
% The abstract is stored as plain text: emphasis markup is left to the bibliography style.
@inproceedings{selialia-etal-2026-mitigating,
  title     = {Mitigating Tokenization-Induced Distance Distortion in Long-Context Multilingual Machine Translation},
  author    = {Selialia, Khotso and
               Nzeyimana, Antoine and
               Anwar, Fatima M.},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1696/},
  pages     = {36591--36602},
  isbn      = {979-8-89176-390-6},
  abstract  = {Multilingual neural machine translation (MNMT) models degrade in performance as input context length increases, causing positional encoding schemes to misinterpret token distances. Existing absolute and relative positional encodings rely on fixed token indices and implicitly assume uniform semantic density, which breaks down for long-context inputs. We introduce DCARPE, a tokenization-aware adaptive positional encoding that conditions relative positional bias on input-level sequence length and fragmentation statistics, allowing the model to reinterpret positional distance when tokenization-induced inflation arises rather than semantic factors. Evaluations on JW300 and out-of-distribution FLORES-200 demonstrate consistent improvements in long-context robustness, achieving gains of up to +10.81 ChrF++ and +8.00 BLEU over baselines.},
}

Markdown (Informal)
[Mitigating Tokenization-Induced Distance Distortion in Long-Context Multilingual Machine Translation](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1696/) (Selialia et al., ACL 2026)
ACL