@inproceedings{castilho-2021-towards,
    title     = {Towards Document-Level Human {MT} Evaluation: On the Issues of Annotator Agreement, Effort and Misevaluation},
    author    = {Castilho, Sheila},
    editor    = {Belz, Anya and
                 Agarwal, Shubham and
                 Graham, Yvette and
                 Reiter, Ehud and
                 Shimorina, Anastasia},
    booktitle = {Proceedings of the Workshop on Human Evaluation of {NLP} Systems ({HumEval})},
    month     = apr,
    year      = {2021},
    address   = {Online},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-emnlp/2021.humeval-1.4/},
    pages     = {34--45},
    abstract  = {Document-level human evaluation of machine translation (MT) has been raising interest in the community. However, little is known about the issues of using document-level methodologies to assess MT quality. In this article, we compare the inter-annotator agreement (IAA) scores, the effort to assess the quality in different document-level methodologies, and the issue of misevaluation when sentences are evaluated out of context.},
}
Markdown (Informal)
[Towards Document-Level Human MT Evaluation: On the Issues of Annotator Agreement, Effort and Misevaluation](https://preview.aclanthology.org/ingest-emnlp/2021.humeval-1.4/) (Castilho, HumEval 2021)
ACL