% Conference paper introducing MAGNet, a morphology-aware gated
% multi-granularity pre-training framework for agglutinative languages.
% NOTE(review): `url` points at a preview/ingest host
% (preview.aclanthology.org/ingest-acl); confirm the canonical URL before citing.
% NOTE(review): the URL id contains "acl-srw" while `booktitle` names the main
% ACL 2026 proceedings; verify the volume title against the published record.
@inproceedings{zhang-etal-2026-morphology,
    title = "Morphology-Aware Multi-Granularity Representation Learning for Agglutinative Languages",
    author = "Zhang, Zhonghao and
      Liu, Na and
      Ma, Jiajia and
      Wu, Nier and
      Liu, Guiping",
    editor = "T.Y.S.S., Santosh and
      Rodriguez, Juan Diego and
      de Gibert, Ona",
    booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl/2026.acl-srw.92/",
    pages = "1065--1073",
    isbn = "979-8-89176-393-7",
    abstract = "Low-resource agglutinative languages, characterized by rich morphological inflection and severe vocabulary sparsity in corpora, have long posed numerous challenges in the field of representation learning. Word-level representations preserve semantic integrity but struggle to handle sparse surface forms, whereas morpheme-level representations, though easier to learn, often lack holistic semantic information. Existing multi-granularity methods are typically modeled at the word and phrase levels, with very limited application to low-resource agglutinative languages. Focusing on the morphemes of agglutinative languages, this paper proposes MAGNet, a morphology-aware gated multi-granularity pre-training framework. At the morpheme granularity, this framework leverages morphological knowledge and integrates morpheme segmentation with morphological tagging to construct fine-grained representations. It further introduces a morphology-aware masked language modeling objective to facilitate the model in learning functional morphological regularities. Meanwhile, at the word granularity, a word-level encoder is employed to capture contextual semantics and maintain its semantic coherence. Finally, a gated fusion mechanism dynamically fuses representations of different granularities according to the context. Experiments conducted on two low-resource agglutinative languages, Mongolian and Turkish, for the tasks of dependency parsing and named entity recognition (NER) demonstrate that our method achieves consistent performance improvements over strong baseline models. Ablation studies further validate the complementary roles of morphological tagging and whole-word modeling in efficient representation learning."
}
Markdown (Informal)
[Morphology-Aware Multi-Granularity Representation Learning for Agglutinative Languages](https://preview.aclanthology.org/ingest-acl/2026.acl-srw.92/) (Zhang et al., ACL 2026)
ACL