% SciEx paper (Dinh et al.), EMNLP 2024 main-conference proceedings.
% The url field uses the canonical ACL Anthology address (same Anthology ID as
% the DOI suffix) rather than a temporary preview-branch link.
@inproceedings{dinh-etal-2024-sciex,
    author    = {Dinh, Tu Anh and
                 Mullov, Carlos and
                 B{\"a}rmann, Leonard and
                 Li, Zhaolin and
                 Liu, Danni and
                 Rei{\ss}, Simon and
                 Lee, Jueun and
                 Lerzer, Nathan and
                 Gao, Jianfeng and
                 Peller-Konrad, Fabian and
                 R{\"o}ddiger, Tobias and
                 Waibel, Alexander and
                 Asfour, Tamim and
                 Beigl, Michael and
                 Stiefelhagen, Rainer and
                 Dachsbacher, Carsten and
                 B{\"o}hm, Klemens and
                 Niehues, Jan},
    title     = {{SciEx}: Benchmarking Large Language Models on Scientific Exams with Human Expert Grading and Automatic Grading},
    editor    = {Al-Onaizan, Yaser and
                 Bansal, Mohit and
                 Chen, Yun-Nung},
    booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
    month     = nov,
    year      = {2024},
    address   = {Miami, Florida, USA},
    publisher = {Association for Computational Linguistics},
    pages     = {11592--11610},
    doi       = {10.18653/v1/2024.emnlp-main.647},
    url       = {https://aclanthology.org/2024.emnlp-main.647/},
}