% SciEx benchmark paper, EMNLP 2024 main conference proceedings
% (ACL Anthology ID 2024.emnlp-main.647, matching the DOI suffix below).
% The url field holds the canonical Anthology address rather than a
% temporary preview-branch build, so the link stays resolvable.
@inproceedings{dinh-etal-2024-sciex,
  author    = {Dinh, Tu Anh and
               Mullov, Carlos and
               B{\"a}rmann, Leonard and
               Li, Zhaolin and
               Liu, Danni and
               Rei{\ss}, Simon and
               Lee, Jueun and
               Lerzer, Nathan and
               Gao, Jianfeng and
               Peller-Konrad, Fabian and
               R{\"o}ddiger, Tobias and
               Waibel, Alexander and
               Asfour, Tamim and
               Beigl, Michael and
               Stiefelhagen, Rainer and
               Dachsbacher, Carsten and
               B{\"o}hm, Klemens and
               Niehues, Jan},
  title     = {{SciEx}: Benchmarking Large Language Models on Scientific Exams with Human Expert Grading and Automatic Grading},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {11592--11610},
  doi       = {10.18653/v1/2024.emnlp-main.647},
  url       = {https://aclanthology.org/2024.emnlp-main.647/},
}