% Conference paper, ACL Anthology ID 2026.propor-2.11 (PROPOR 2026, Vol. 2).
% The url field uses the canonical Anthology address instead of the temporary
% preview/ingest build. Words that must keep their capitals under
% sentence-casing styles are brace-protected in the title.
@inproceedings{barbosa-maua-2026-automated,
  title     = {Automated Essay Scoring for {Brazilian} {Portuguese}. {Evidence} from Cross-Prompt Evaluation of {ENEM} Essays},
  author    = {Barbosa, Andr{\'e} and
               Mau{\'a}, Denis Deratani},
  editor    = {Souza, Marlo and
               de-Dios-Flores, Iria and
               Santos, Diana and
               Freitas, Larissa and
               Souza, Jackson Wilke da Cruz and
               Ribeiro, Eug{\'e}nio},
  booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 2},
  month     = apr,
  year      = {2026},
  address   = {Salvador, Brazil},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.propor-2.11/},
  pages     = {43--48},
  isbn      = {979-8-89176-387-6},
  abstract  = {Brazil{'}s ENEM, a high-stakes assessment determining university admission for millions of students annually, creates an immense evaluation burden where human raters process hundreds of essays daily. Automated Essay Scoring (AES) offers a potential solution, yet Portuguese-language systems remain understudied due to fragmented datasets and the complexity of ENEM{'}s multi-trait rubric. This work investigated cross-prompt, trait-specific essay scoring using a corpus of 385 essays across 38 prompts, where models evaluated essays on unseen prompts across five traits scored on a six-point ordinal scale. We compared three model classes: feature-based methods (72 features), encoder-only transformers (109M{--}1.5B parameters), and decoder architectures (2.4B{--}671B parameters) with fine-tuned and zero-shot configurations. Experiments under varying information access and rubric conditioning revealed that no single approach serves all evaluation needs: encoder models excel at mechanical traits (fluency, cohesion) despite context limitations; decoder models achieve superior performance on argumentation (QWK 0.73) and writing style (QWK 0.60) when provided full context; and language-specific pretraining benefits only surface-level features without improving complex reasoning. Best-performing models achieved QWK scores of 0.60{--}0.73. Gaps to oracle bounds ranged from 0.15 (argumentation) to 0.29 (writing style), with the largest disparities in writing style and persuasiveness.},
}
Markdown (Informal)
[Automated Essay Scoring for Brazilian Portuguese. Evidence from Cross-Prompt Evaluation of ENEM Essays](https://aclanthology.org/2026.propor-2.11/) (Barbosa & Mauá, PROPOR 2026)
ACL