% Workshop paper from BioTxtM 2016 (ACL Anthology ID W16-5109).
% The url field uses the permanent Anthology address rather than a preview-branch deployment.
@inproceedings{zweigenbaum-etal-2016-supervised,
  title     = {Supervised classification of end-of-lines in clinical text with no manual annotation},
  author    = {Zweigenbaum, Pierre and
               Grouin, Cyril and
               Lavergne, Thomas},
  editor    = {Ananiadou, Sophia and
               Batista-Navarro, Riza and
               Cohen, Kevin Bretonnel and
               Demner-Fushman, Dina and
               Thompson, Paul},
  booktitle = {Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining ({BioTxtM}2016)},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  url       = {https://aclanthology.org/W16-5109/},
  pages     = {80--88},
  abstract  = {In some plain text documents, end-of-line marks may or may not mark the boundary of a text unit (e.g., of a paragraph). This vexing problem is likely to impact subsequent natural language processing components, but is seldom addressed in the literature. We propose a method which uses no manual annotation to classify whether end-of-lines must actually be seen as simple spaces (soft line breaks) or as true text unit boundaries. This method, which includes self-training and co-training steps based on token and line length features, achieves 0.943 F-measure on a corpus of short e-books with controlled format, F=0.904 on a random sample of 24 clinical texts with soft line breaks, and F=0.898 on a larger set of mixed clinical texts which may or may not contain soft line breaks, a fairly high value for a method with no manual annotation.},
}
Markdown (Informal)
[Supervised classification of end-of-lines in clinical text with no manual annotation](https://aclanthology.org/W16-5109/) (Zweigenbaum et al., 2016)
ACL