% Workshop paper record for TajPersLexon (Arabov, 2026), pages 29--37.
% The url field holds the canonical Anthology address derived from the paper ID
% 2026.silkroadnlp-1.4; the exported record pointed at a preview-branch build
% (preview.aclanthology.org/manual-author-scripts/), which is presumably
% temporary. NOTE(review): confirm the canonical page resolves.
% Case-sensitive words are brace-protected as whole words so that styles which
% sentence-case titles keep them intact.
@inproceedings{arabov-2026-tajperslexon,
  title     = {{TajPersLexon}: A {Tajik}--{Persian} Lexical Resource and Hybrid Model for Cross-Script Low-Resource {NLP}},
  author    = {Arabov, Mullosharaf Kurbonovich},
  editor    = {Merchant, Rayyan and Megerdoomian, Karine},
  booktitle = {The Proceedings of the First Workshop on {NLP} and {LLMs} for the {Iranian} Language Family},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.silkroadnlp-1.4/},
  pages     = {29--37},
  isbn      = {979-8-89176-371-5},
  abstract  = {This work introduces TajPersLexon, a curated Tajik--Persian parallel lexical resource of 40,112 word and short-phrase pairs for cross-script lexical retrieval, transliteration, and alignment in low-resource settings. We conduct a comprehensive CPU-only benchmark comparing three methodological families: (i) a lightweight hybrid pipeline, (ii) neural sequence-to-sequence models, and (iii) retrieval methods. Our evaluation establishes that the task is essentially solvable, with neural and retrieval baselines achieving 98--99\% top-1 accuracy. Crucially, we demonstrate that while large multilingual sentence transformers fail on this exact lexical matching, our interpretable hybrid model offers a favorable accuracy-efficiency trade-off for practical applications, achieving 96.4\% accuracy in an OCR post-correction task. All experiments use fixed random seeds for full reproducibility. The dataset, code, and models will be publicly released.},
}
Markdown (Informal)
[TajPersLexon: A Tajik–Persian Lexical Resource and Hybrid Model for Cross-Script Low-Resource NLP](https://preview.aclanthology.org/manual-author-scripts/2026.silkroadnlp-1.4/) (Arabov, SilkRoadNLP 2026)
ACL