% COLING 2016 technical paper on ngram-based compound segmentation for Finnish
% (ACL Anthology ID C16-1061). The url field uses the canonical Anthology
% address rather than a temporary preview-branch link.
@inproceedings{shapiro-2016-splitting,
  title     = {Splitting compounds with ngrams},
  author    = {Shapiro, Naomi Tachikawa},
  editor    = {Matsumoto, Yuji and Prasad, Rashmi},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  url       = {https://aclanthology.org/C16-1061/},
  pages     = {630--640},
  abstract  = {Compound words with unmarked word boundaries are problematic for many tasks in NLP and computational linguistics, including information extraction, machine translation, and syllabification. This paper introduces a simple, proof-of-concept language modeling approach to automatic compound segmentation, as applied to Finnish. This approach utilizes an off-the-shelf morphological analyzer to split training words into their constituent morphemes. A language model is subsequently trained on ngrams composed of morphemes, morpheme boundaries, and word boundaries. Linguistic constraints are then used to weed out phonotactically ill-formed segmentations, thereby allowing the language model to select the best grammatical segmentation. This approach achieves an accuracy of {\textasciitilde}97{\%}.},
}
Markdown (Informal)
[Splitting compounds with ngrams](https://aclanthology.org/C16-1061/) (Shapiro, COLING 2016)
ACL
- Naomi Tachikawa Shapiro. 2016. Splitting compounds with ngrams. In Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers, pages 630–640, Osaka, Japan. The COLING 2016 Organizing Committee.