@inproceedings{samuel-straka-2021-ufal,
title = "{{\'U}FAL} at {M}ulti{L}ex{N}orm 2021: Improving Multilingual Lexical Normalization by Fine-tuning {B}y{T}5",
author = "Samuel, David and
Straka, Milan",
editor = "Xu, Wei and
Ritter, Alan and
Baldwin, Tim and
Rahimi, Afshin",
booktitle = "Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2021.wnut-1.54/",
doi = "10.18653/v1/2021.wnut-1.54",
pages = "483--492",
abstract = "We present the winning entry to the Multilingual Lexical Normalization (MultiLexNorm) shared task at W-NUT 2021 (van der Goot et al., 2021a), which evaluates lexical-normalization systems on 12 social media datasets in 11 languages. We base our solution on a pre-trained byte-level language model, ByT5 (Xue et al., 2021a), which we further pre-train on synthetic data and then fine-tune on authentic normalization data. Our system achieves the best performance by a wide margin in intrinsic evaluation, and also the best performance in extrinsic evaluation through dependency parsing. The source code is released at \url{https://github.com/ufal/multilexnorm2021} and the fine-tuned models at \url{https://huggingface.co/ufal}."
}
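
The abstract notes that the fine-tuned models are released at https://huggingface.co/ufal. A minimal sketch of loading one with the HuggingFace transformers library follows; the checkpoint id and the plain-sentence input format are assumptions for illustration only, since the authors' repository (https://github.com/ufal/multilexnorm2021) documents the exact per-word input encoding the models expect.

    # Sketch only: checkpoint id below is assumed, not confirmed by the paper.
    from transformers import AutoTokenizer, T5ForConditionalGeneration

    model_name = "ufal/byt5-small-multilexnorm2021-en"  # hypothetical checkpoint id
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # ByT5 operates directly on UTF-8 bytes, so noisy spellings need no
    # subword vocabulary tricks; the model maps noisy text to normalized text.
    noisy = "u gotta c this, its gr8"
    inputs = tokenizer(noisy, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=64)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))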