@inproceedings{chiang-etal-2025-tract,
title = "{TRACT}: Regression-Aware Fine-tuning Meets Chain-of-Thought Reasoning for {LLM}-as-a-Judge",
author = "Chiang, Cheng-Han and
Lee, Hung-yi and
Lukasik, Michal",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.147/",
pages = "2934--2952",
ISBN = "979-8-89176-251-0",
abstract = "The LLM-as-a-judge paradigm uses large language models (LLMs) for automated text evaluation, assigning a score to the input based on scoring rubrics. Existing methods for fine-tuning LLM-as-a-judge use cross-entropy (CE) loss, which neglects the numeric nature of score prediction. Recent work addresses numerical prediction limitations of LLM fine-tuning through regression-aware fine-tuning but does not consider chain-of-thought (CoT) reasoning for score prediction. In this paper, we introduce TRACT (Two-stage Regression-Aware fine-tuning with CoT), which combines CoT reasoning with regression-aware training. TRACT uses a two-stage process: first, it fine-tunes the seed LLM to generate CoTs, which serve as the training data for the second stage; next, it uses these self-generated CoTs to retrain the seed LLM. The fine-tuning objective of TRACT applies CE loss for CoT reasoning and regression-aware loss for the score. Experiments across four LLM-as-a-judge datasets and two LLMs show that TRACT significantly outperforms existing methods. Extensive ablation studies validate the effectiveness of each component in TRACT."
}
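
The abstract describes a combined objective: cross-entropy on the chain-of-thought tokens plus a regression-aware term on the predicted score. Below is a minimal sketch of what such a combined loss could look like, assuming (as in prior regression-aware fine-tuning work) that the regression term is a squared error between the gold score and the expected score under the model's distribution over score tokens; the score range (1-5), the weighting factor `lam`, and all function and variable names are illustrative assumptions, not the paper's exact implementation.

```python
# Hedged sketch of a TRACT-style combined loss: cross-entropy on the CoT tokens
# plus a regression-aware (squared-error) term on the numeric score token.
# Score range, loss weight, and tensor layout are illustrative assumptions.
import torch
import torch.nn.functional as F

def tract_style_loss(logits, labels, score_pos, score_token_ids, gold_score, lam=1.0):
    """
    logits:          (seq_len, vocab) next-token logits for one example
    labels:          (seq_len,) target token ids, already shifted/aligned;
                     -100 marks positions excluded from the CE loss
    score_pos:       position whose next token is the numeric score
    score_token_ids: tensor of vocabulary ids for the score tokens, e.g. "1".."5"
    gold_score:      float gold score for this example
    """
    # Cross-entropy over the supervised tokens (the CoT rationale).
    ce = F.cross_entropy(logits, labels, ignore_index=-100)

    # Regression-aware term: expected score under the model's distribution
    # restricted to the score tokens, pulled toward the gold score.
    score_logits = logits[score_pos, score_token_ids]            # (num_scores,)
    probs = torch.softmax(score_logits, dim=-1)
    score_values = torch.arange(1, len(score_token_ids) + 1, dtype=probs.dtype)
    expected_score = (probs * score_values).sum()
    reg = (expected_score - gold_score) ** 2

    return ce + lam * reg
```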