% Conference paper from the *SEM 2025 proceedings (ACL Anthology export).
% NOTE(review): the url below is a preview.aclanthology.org ingest path,
% presumably a staging address -- confirm and switch to the canonical
% Anthology URL (and add a doi, if one was assigned) once verified.
@inproceedings{qiu-etal-2025-chengyusts,
  title     = {{ChengyuSTS}: An Intrinsic Perspective on {Mandarin} Idiom Representation},
  author    = {Qiu, Le and
               Chersoni, Emmanuele and
               Villavicencio, Aline},
  editor    = {Frermann, Lea and
               Stevenson, Mark},
  booktitle = {Proceedings of the 14th Joint Conference on Lexical and Computational Semantics ({*SEM} 2025)},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-emnlp/2025.starsem-1.1/},
  pages     = {1--12},
  isbn      = {979-8-89176-340-1},
  abstract  = {Chengyu, or four-character idioms, are ubiquitous in both spoken and written Chinese. Despite their importance, chengyu are often underexplored in NLP tasks, and existing evaluation frameworks are primarily based on extrinsic measures. In this paper, we introduce an intrinsic evaluation task for Chinese idiomatic understanding: idiomatic semantic textual similarity (iSTS), which evaluates how well models can capture the semantic similarity of sentences containing idioms. To this purpose, we present a curated dataset: ChengyuSTS. Our experiments show that current pre-trained sentence Transformer models generally fail to capture the idiomaticity of chengyu in a zero-shot setting. We then show results of fine-tuned models using the SimCSE contrastive learning framework, which demonstrate promising results for handling idiomatic expressions. We also presented the results of DeepSeek for reference.},
}
Markdown (Informal)
[ChengyuSTS: An Intrinsic Perspective on Mandarin Idiom Representation](https://preview.aclanthology.org/ingest-emnlp/2025.starsem-1.1/) (Qiu et al., *SEM 2025)
ACL