% Workshop paper from CL4LC 2016 (ACL Anthology ID W16-4118).
% The url field uses the canonical Anthology address instead of a
% temporary preview-branch deployment link.
@inproceedings{albertsson-etal-2016-similarity,
  title     = {Similarity-Based Alignment of Monolingual Corpora for Text Simplification Purposes},
  author    = {Albertsson, Sarah and
               Rennes, Evelina and
               J{\"o}nsson, Arne},
  editor    = {Brunato, Dominique and
               Dell{'}Orletta, Felice and
               Venturi, Giulia and
               Fran{\c{c}}ois, Thomas and
               Blache, Philippe},
  booktitle = {Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity ({CL4LC})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  url       = {https://aclanthology.org/W16-4118/},
  pages     = {154--163},
  abstract  = {Comparable or parallel corpora are beneficial for many NLP tasks. The automatic collection of corpora enables large-scale resources, even for less-resourced languages, which in turn can be useful for deducing rules and patterns for text rewriting algorithms, a subtask of automatic text simplification. We present two methods for the alignment of Swedish easy-to-read text segments to text segments from a reference corpus. The first method (M1) was originally developed for the task of text reuse detection, measuring sentence similarity by a modified version of a TF-IDF vector space model. A second method (M2), also accounting for part-of-speech tags, was developed, and the methods were compared. For evaluation, a crowdsourcing platform was built for human judgement data collection, and preliminary results showed that cosine similarity relates better to human ranks than the Dice coefficient. We also saw a tendency that including syntactic context to the TF-IDF vector space model is beneficial for this kind of paraphrase alignment task.},
}
Markdown (Informal)
[Similarity-Based Alignment of Monolingual Corpora for Text Simplification Purposes](https://aclanthology.org/W16-4118/) (Albertsson et al., CL4LC 2016)
ACL