% This must be in the first 5 lines to tell arXiv to use pdfLaTeX, which is strongly recommended.
\pdfoutput=1
% In particular, the hyperref package requires pdfLaTeX in order to break URLs across lines.

\documentclass[11pt]{article}

% Remove the "review" option to generate the final version.
\usepackage{ACL2023}

% Standard package includes
\usepackage{times}
\usepackage{latexsym}
\usepackage{booktabs}
\usepackage{amssymb}

% For proper rendering and hyphenation of words containing Latin characters (including in bib files)
\usepackage[T1]{fontenc}
% For Vietnamese characters
% \usepackage[T5]{fontenc}
% See https://www.latex-project.org/help/documentation/encguide.pdf for other character sets

% This assumes your files are encoded as UTF8
\usepackage[utf8]{inputenc}

% This is not strictly necessary, and may be commented out.
% However, it will improve the layout of the manuscript,
% and will typically save some space.
\usepackage{microtype}

% This is also not strictly necessary, and may be commented out.
% However, it will improve the aesthetics of text in
% the typewriter font.
\usepackage{inconsolata}

\usepackage{xcolor} % required for \color in the annotation macros below
\newcommand{\alkis}[1]{ {\color{blue} AK: #1} }
\newcommand{\eliana}[1]{ {\color{purple} EP: #1} }
\newcommand{\irene}[1]{ {\color{olive} IB: #1} }
\newcommand{\lorenzo}[1]{ {\color{orange} LV: #1} }

\usepackage{comment}
\usepackage{hwemoji}

% If the title and author information does not fit in the area allocated, uncomment the following
%
%\setlength\titlebox{<dim>}
%
% and set <dim> to something 5cm or larger.

%\title{MAINDZ (MINDS + MAIZE) at SemEval-2023 Task 5: CLUEDO}
\title{MAINDZ at SemEval-2024 Task 5: CLUEDO - Choosing Legal oUtcome by Explaining Decisions through Oversight}

%\title{MAINDZ at SemEval-2024 Task 5: CLUEDO  - Choosing Legal Outcome by Explaining Decision through Oversight }
% \emoji{detective}

%\textbf{C}hoosing \textbf{L}egal o\textbf{U}tcome by \textbf{E}xplaining \textbf{D}ecisions through \textbf{O}versight

% 
%
%


\author{
    Irene Benedetto$^{1,2}$ \\\And
    Alkis Koudounas$^{1}$ \\\And
    Lorenzo Vaiani$^{1}$ \\\AND
    Eliana Pastor$^{1}$ \\\And
    Luca Cagliero$^{1}$ \\ \\
    \textsuperscript{1} Politecnico di Torino, \texttt{\{name.surname\}@polito.it} \\
    \textsuperscript{2} MAIZE, \texttt{\{name.surname\}@maize.io} \\\And
    Francesco Tarasconi$^{2}$ \\
  }

\begin{document}
\maketitle
\begin{abstract}

Large language models (LLMs) have recently obtained strong performance on complex reasoning tasks. 
However, their capabilities in specialized domains like law remain relatively unexplored. 
We present CLUEDO, a system to tackle a novel legal reasoning task that involves determining if a provided answer correctly addresses a legal question derived from U.S. civil procedure cases. CLUEDO utilizes multiple collaborator models that are trained using multiple-choice prompting to choose the right label and generate explanations. These collaborators are overseen by a final ``detective'' model that identifies the most accurate answer in a zero-shot manner. Our approach achieves an F1 macro score of 0.74 on the development set and 0.77 on the test set, outperforming individual models. Unlike the powerful GPT-4, CLUEDO provides more stable predictions thanks to the ensemble approach. Our results showcase the promise of tailored frameworks to enhance legal reasoning capabilities in LLMs. 

\end{abstract}


\section{Introduction}
Recent improvements in large language models are leading to a rethinking of legal practices, particularly in the United States~\cite{frankenreiter2022natural, hoffman2023generative, glaze2021artificial}. This can potentially transform time-consuming tasks such as brief writing and corporate compliance~\cite{guha2023legalbench,inbook}. 
This could also contribute to alleviating the access-to-justice crisis~\cite{justicegap, tito2017ai}. The unique properties of LLMs, including their ability to learn from limited labeled data and proficiency in complex reasoning tasks, make them appealing for legal applications~\cite{zheng2021does, guha2023legalbench,benedetto-etal-2023-politohfi,benedetto2024}.

However, enthusiasm is tempered by concerns about the risks associated with LLMs, such as generating offensive, misleading, or factually incorrect content~\cite{engstrom2020legal, bender2021dangers}. These issues could have significant consequences, particularly affecting marginalized or under-resourced populations~\cite{surden2020ethics, gptconspiracy, koudounas2023icassp, koudounas2024taslp}.

To address safety implications, there is a pressing need to evolve and enhance legal reasoning capabilities in LLMs.
Despite this urgency, practitioners face challenges in assessing LLMs' legal reasoning capabilities, as existing legal benchmarks are limited and often fail to capture the diverse aspects of legal tasks~\cite{guha2023legalbench}. 

In this direction, the organizers of SemEval-2024 Task 5 introduce a novel Natural Language Processing (NLP) task and dataset derived from the U.S. civil procedure domain~\cite{bongard-etal-2022-legal}. 
Each dataset instance comprises a case introduction, a specific question, and a potential solution argument, along with an in-depth analysis justifying the argument's applicability to the case. 
%The dataset's source, a legal textbook designed for law students, heightens the complexity of the task, making it a challenging benchmark for evaluating the capabilities of contemporary legal language models.
When provided with a topic introduction, a question, and a potential answer, the objective of the proposed task is to determine whether the given answer is accurate or not.

To tackle this task, we initially transform the dataset into a multiple-choice question answering problem using the multiple-choice prompting (MCP) approach~\cite{robinson2023leveraging}. We experimented with various open-source language models on this modified dataset, including Flan T5 XXL~\cite{wei2021finetuned, chung2022scaling}, Llama 2 7B and 13B~\cite{touvron2023llama}, Zephyr 7B~\cite{touvron2023llama}, and Mistral 7B~\cite{jiang2023mistral}. Specifically, we trained these models to solve legal problems while also providing an explanation for the predicted outcome, leveraging the analysis provided.
% NOTE(review): Zephyr currently cites the Llama paper (touvron2023llama); replace with the Zephyr reference (Tunstall et al., 2023).
We thus introduce the \textit{CLUEDO} approach, which stands for ``\textbf{C}hoosing \textbf{L}egal o\textbf{U}tcome by \textbf{E}xplaining \textbf{D}ecisions through \textbf{O}versight''. 
This framework utilizes multiple collaborative models to synthesize the final outcome based on each model's predictions. Each individual model is trained to predict the label of the correct candidate answer and generate an explanation accordingly. 
The final \textit{``detective''} model operates in a zero-shot manner, relying upon the outputs of the collaborators.
The model processes the answers and the explanations of all collaborators and deduces the ultimate answer.



The results on the challenge dataset demonstrate that our proposed methodology surpasses the performance of single models trained with standard fine-tuning. Furthermore, our approach achieved the second-place position in the public competition, achieving a final test F1 macro score of 0.77\footnote{Code available at \url{https://github.com/irenebenedetto/PoliToHFI-SemEval2024-Task5}}.

\paragraph{Research Questions.}
We investigate the following research questions (RQs):
\begin{itemize}
\item   \textbf{RQ1.} Is the multiple-choice setting more effective than the single-choice one?
 
\item  \textbf{RQ2.} Does including the analysis in the training and generation process improve performance?

\item \textbf{RQ3.} Is our detective model CLUEDO more effective than individual collaborators in a zero-shot setting?
Are CLUEDO results more stable?
\end{itemize}

\section{Related Work}

%The integration of artificial intelligence (AI) technologies in various domains has revolutionized traditional practices and paved the way for enhanced efficiency and accuracy. 
In the legal domain, the advent of Legal LLMs has reshaped how legal professionals approach case analysis, decision-making, and document generation processes~\cite{lai2023large}. 
LLMs possess logical reasoning capabilities that enable legal professionals to comprehend case processes, aid judges in decision-making, swiftly identify similar cases through language comprehension, analyze and condense essential case details, and utilize automated content generation to draft repetitive legal documents~\cite{guha2023legalbench}.
%In particular, LLMs have been instrumental in solving various tasks within the legal domain, including legal machine translation~\cite{brivaiglesias2024large}, legal NER~\cite{bernsohn2024legallens}, legal violation retrieval~\cite{zhou2023boosting} and court judgment prediction~\cite{shui2023comprehensive} tasks. 
%This promising potential of legal LLMs in addressing a wide range of reasoning tasks within the legal domain offers a multifaceted approach to enhancing legal decision-making processes and augmenting the efficiency of legal practitioners in navigating complex legal frameworks.
Researchers have recently started exploring whether large language models have the capability to carry out legal reasoning. Unlike BERT-based models, LLMs are evaluated on their ability to learn tasks in-context, primarily through prompting~\cite{liu2022fewshot}.  Studies have explored the role of prompt-engineering for Legal Judgment Prediction~\cite{jiang2023legal}, statutory reasoning~\cite{blairstanek2023gpt3}, and legal exams~\cite{yu-etal-2023-exploring}. Several case studies~\cite{nay2023large,drápal2023using,Savelka_2023,savelka2023explaining,westermann2023llmediator} highlight the potential and the limitations of GPT models in real use cases. However, to the best of our knowledge, limited effort has been devoted to analyzing the effectiveness of smaller and open-source language models (e.g., Llama 2~\cite{touvron2023llama}) in this domain~\cite{guha2023legalbench}, and how they can effectively be employed in conjunction with closed-source foundational models, such as GPT-4~\cite{openai2023gpt4}.



%potential applications [93, 29, 147, 117, 116]

%, questions regarding human-LLM interaction [31, 63], and comparisons to older finetuned-models [91].



%\irene{secondo me manca un po' di focus sui modelli applicati ai legal reasoning, secondo me \url{https://arxiv.org/pdf/2308.11462.pdf} ti può aiutare, soprattutto la parte che va da "Importantly, the majority [...]" fino a "[...]older finetuned-models"}

\section{Dataset and Task Description}
\citet{bongard-etal-2022-legal} present a new dataset from the U.S. civil procedure domain. This dataset is derived from a book intended for law students, suggesting its complexity and suitability for benchmarking modern legal language models. Each instance of the dataset consists of: 
\begin{itemize}
    \item \textit{General introduction to the case}: an overview of the case to set the context.
    \item \textit{Particular question}: a specific legal question related to the case is presented.
    \item \textit{Possible solution argument}: a potential answer associated with the question is provided.
    \item \textit{Annotated label}: it defines if the possible solution is correct (1) or not (0).
    \item \textit{Detailed analysis}: Accompanying each solution argument is a thorough analysis explaining why the argument applies to the case in question.
\end{itemize}

The task is structured as a binary classification task where the goal is to predict the correctness of the provided answer, i.e., to predict the label given the textual information.
The analysis and the labels are not available at test time. 

\begin{table*}[]
\caption{\textbf{Zero-shot} models on dev set. The best performance (in terms of F1 macro) for each model family is in bold. The multiple-choice approach leads to higher performance in five out of six cases.}
\label{tab:zero-shot-results}
\centering
\begin{tabular}{llcccc}
\toprule
\multicolumn{1}{c}{\textbf{Model}} & \multicolumn{1}{c}{\textbf{Classification task}} & \multicolumn{1}{c}{\textbf{Prec}} & \multicolumn{1}{c}{\textbf{Rec}} & \multicolumn{1}{c}{\textbf{F1}} & \multicolumn{1}{c}{\textbf{Acc}} \\
\midrule
Flan T5 XXL 
    & Multiple choice 
    & 0.60 
    & 0.67
    & \textbf{0.59}
    & 0.64 \\
Flan T5 XXL  
    & Single choice   
    & 0.54   
    & 0.53
    & 0.32  
    & 0.32 \\
\midrule

GPT-4 
    & Multiple choice   
    & 0.66  
    & 0.73
    & \textbf{0.66}
    & 0.57 \\
GPT-4 
    & Single choice 
    & 0.40   
    & 0.50
    & 0.44  
    & 0.80 \\
\midrule

Llama 2 13B  
    & Multiple choice        
    & 0.64   
    & 0.58
    & \textbf{0.59}
    & 0.79 \\
Llama 2 13B  
    & Single choice        
    & 0.55   
    & 0.58
    & 0.54  
    & 0.61 \\     
\midrule

Llama 2 7B   
    & Multiple choice        
    & 0.51   
    & 0.51
    & 0.51  
    & 0.74 \\
Llama 2 7B   
    & Single choice   
    & 0.53   
    & 0.52
    & \textbf{0.52}  
    & 0.73 \\
\midrule

Mistral v0.1 7B    
    & Multiple choice        
    & 0.55   
    & 0.59
    & \textbf{0.54}
    & 0.61 \\
Mistral v0.1 7B    
    & Single choice   
    & 0.55   
    & 0.58
    & 0.52  
    & 0.57 \\
\midrule

Zephyr beta 7B     
    & Multiple choice     
    & 0.54   
    & 0.56
    & \textbf{0.50}
    & 0.69  \\   
Zephyr beta 7B     
    & Single choice  
    & 0.40   
    & 0.50
    & 0.44  
    & 0.80 \\
\bottomrule

\end{tabular}
\end{table*}

\section{System Overview}
This section provides a comprehensive overview of the proposed methodology. Firstly, we outline the approach to the multiple-choice question-answering problem and how we adapt it to our scenario. Secondly, we introduce the \textit{CLUEDO} framework, along with details about the competitors incorporated into our study.

\paragraph{Multiple-choice.}
Following the intuition of~\citet{robinson2023leveraging}, we convert the dataset into a multiple-choice question answering problem and adopt multiple choice prompting (MCP)~\cite{robinson2023leveraging}. In MCP, the language model is presented not only with the question but also with a set of candidate answers, akin to a multiple-choice test. Each answer is linked to a symbol such as \textit{``A,''} \textit{``B,''} or \textit{``C.''} 
This approach enables the model to compare answer choices explicitly and reduces the computational cost of generation.
In cases where there is only one candidate answer, the system automatically generates the alternative \textit{``None of the above is true''}. These additional answers are not taken into account in the test and validation metrics. 



%which is subsequently excluded during validation and testing.
%\irene{inserire perchè}

In our experiments, we evaluate whether the multiple-choice approach is indeed more effective than a single-choice approach. In the single-choice setting, we prompt the model with a single candidate answer, and the model should directly predict whether it is correct.  


\begin{table}[]
\centering
\caption{\textbf{Trained} models performance on dev set. All models are trained to generate both labels and analysis, following the multiple-choice setting.}
\label{tab:peft-training}
\begin{tabular}{p{2.5cm}p{0.5cm}p{0.5cm}p{0.5cm}p{0.5cm}}
%\scalebox{0.87}{%
\toprule
\multicolumn{1}{c}{\textbf{Model}} & \multicolumn{1}{c}{\textbf{Prec}} & \multicolumn{1}{c}{\textbf{Rec}} & \multicolumn{1}{c}{\textbf{F1}} & \multicolumn{1}{c}{\textbf{Acc}} \\
\midrule

Llama 2 7B  
    & 0.57 
    & 0.60 
    & 0.56 
    & 0.64  \\
Mistral v0.1 7B  
    & 0.61  
    & 0.63  
    & 0.62 
    & 0.73 \\
Zephyr beta 7B                     
    & 0.62                                         
    & 0.65                                      
    & 0.63                                        
    & 0.73      \\
Llama 2 13B                        
    & 0.65                                         
    & 0.69                                      
    & \textbf{0.66}                               
    & 0.75   \\

\bottomrule
\end{tabular} %}
\end{table}

\paragraph{CLUEDO.} 
To tackle the task of the challenge, we introduce the \textit{CLUEDO} framework, which stands for ``Choosing Legal Outcome by Explaining Decisions through Oversight.'' In a nutshell, multiple collaborative models are trained to predict the correct label for a candidate answer that addresses the legal question. These models generate their analysis as part of their training. The final model, operating in a zero-shot manner, utilizes the responses and explanations from the set of collaborators to identify the most accurate final answer, considering their collective performance. 
More in detail, the \textit{CLUEDO} system is structured as follows:
\begin{itemize}
    \item \textit{N collaborative models}: given the introduction, the legal question, and the candidate answers, these models are trained to predict the label of the candidate answer that correctly responds to the legal question and generate an explanation. We fix the number of collaborators equal to three. We select the collaborators based on their results on the dev set. 
    \item \textit{The final ``detective'' model}: this model is employed in a zero-shot manner. Based on the responses from the collaborators and their corresponding explanations, this model must identify the most accurate final answer, overseeing the collaborators' performance. The final model is also provided with the introduction, legal questions, and candidate answers.
\end{itemize}
Examples of prompts for the collaborative and detective models are reported in Table~\ref{tab:prompts}.

\begin{table*}[]
\caption{\textbf{Example of prompts} for collaborative models and our CLUEDO approach.}
\label{tab:prompts}

\centering
\small
\begin{tabular}{cl}

\toprule
\textbf{Approach} &
  \multicolumn{1}{c}{\textbf{Example Prompt}} \\
\midrule
\rotatebox{90}{~\parbox{1.05cm}{Collaborative Models}} &
%collaborative models &
  \begin{tabular}[c]{@{}l@{}}\textless{}s\textgreater{}{[}INST{]} \textless{}\textless{}SYS\textgreater{}\textgreater 
  Given the following explanation and the question, which of the candidate \\ answers is correct? The correct answer is the one that is true according to the explanation. \textless{}\textless{}/SYS\textgreater{}\textgreater\\ \\ \textless{}explanation\textgreater 
  Although discovery usually extends to all evidence relevant to claims and defenses \\ in the action, Rule 26(b)(1) expressly carves out one {[}...{]} \textless{}/explanation\textgreater\\ \\ \textless{}question\textgreater 4. Confidential chat. 
  Shag, a budding rock star with no business experience, enters into \\ 
  a five-year exclusive contract with Fringe Records, after {[}...{]} \textless{}/question\textgreater\\ \\ \textless{}candidate\_answers\textgreater \\ 
  1 - Shag will not have to answer any of the interrogatories, because all three were discussed in \\
  confidence with Rivera in the course of his representation.\\ 
  2 - Shag will have to answer the first interrogatory, but not the other two.\\ 
  3 - Shag will have to answer all three interrogatories, because [...] \\ 
  5 - None of the above is true. \\ \textless{}/candidate\_answers\textgreater\\ \\ {[}/INST{]}\\ \\ \textless{}correct\_answer\textgreater 5 \textless{}/correct\_answer\textgreater\\ \\ \textless{}analysis\textgreater Let’s start by eliminating A. It proceeds on the premise that all three items are subject \\ to discovery, because all {[}...{]} \\ \textless{}/analysis\textgreater{}\end{tabular} \\
\midrule

%CLUEDO & 
\rotatebox{90}{~\parbox{1.05cm}{CLUEDO}} &
\begin{tabular}[c]{@{}l@{}}You are a legal supervisor tasked with resolving legal queries. \\ You are working alongside three artificial intelligence models, named m1, m2, and m3. \\ Given an introductory context, a question, and a set of candidate answers, these three models \\  must choose the correct answer and provide justification for their choice. Your responsibility  \\ is to assess the models' responses and determine whether they are correct or not. \\ To do so, you must read the context (enclosed within the tags \textless{}context\textgreater{}\textless{}/context\textgreater{}), the question  \\  (within \textless{}question\textgreater{}\textless{}/question\textgreater tags), and the candidate answers (within  \textless{}candidate\_answers\textgreater{} \\ \textless{}/candidate\_answers\textgreater tags), and identify the correct answer among them (using the \\  \textless{}supervisor\_answer\textgreater tag). Additionally, you must provide reasoning for your choice (using the \\  \textless{}supervisor\_explanation\textgreater tag). While collaborating with the models and considering their advice, \\ the ultimate decision rests \\  with you. For each response, use the following format:\\ \textless{}supervisor\_answer\textgreater{}SUPERVISOR ANSWER\textless{}/supervisor\_answer\textgreater\\ \textless{}supervisor\_explanation\textgreater{}SUPERVISOR ANSWER\textless{}/supervisor\_explanation\textgreater\\ \\ \textless{}context\textgreater 
Although discovery usually extends to all evidence relevant to claims and defenses \\ in the action, Rule 26(b)(1) expressly carves out one  {[}...{]} \textless{}/context\textgreater\\ \\ \textless{}question\textgreater 4. Confidential chat. Shag, a budding rock star with no business experience, \\ enters into a five-year exclusive contract with Fringe Records, after {[}...{]} \textless{}/question\textgreater\\ \\ \textless{}candidate\_answers\textgreater \\ 
1 - Shag will not have to answer any of the interrogatories, because all three were discussed in \\ confidence with Rivera in the course of his representation.\\ 
2 - Shag will have to answer the first interrogatory, but not the other two.\\ 
3 - Shag will have to answer all three interrogatories, because [...] \\
5 - None of the above is true. \\ \textless{}/candidate\_answers\textgreater\\ \\ \textless{}m1\_answer\textgreater{}1\textless{}/m1\_answer\textgreater\\ \textless{}m1\_explanation\textgreater {[}...{]} \textless{}/m1\_explanation\textgreater\\ \\ \textless{}m2\_answer\textgreater{}1\textless{}/m2\_answer\textgreater\\ \textless{}m2\_explanation\textgreater  {[}...{]}  \textless{}/m2\_explanation\textgreater\\ \\ \textless{}m3\_answer\textgreater{}2\textless{}/m3\_answer\textgreater\\ \textless{}m3\_explanation\textgreater  {[}...{]} \textless{}/m3\_explanation\textgreater\\ \\ \textless{}supervisor\_answer\textgreater
\end{tabular} \\
\bottomrule

\end{tabular}
\end{table*}





\paragraph{Competitors.}
To assess the strength of the proposed CLUEDO approach, we compare the results with a set of alternatives on the final test set: the best collaborator chosen based on the results achieved on the dev set (which we call \textit{Best collaborator}), and the correction of collaborator models based on consensus (hereafter named \textit{Collaborators agreement}). The latter approach involves taking the predictions of the top-performing collaborator (on the dev set) and rectifying instances where both the second and third collaborators mutually confirm inaccuracies. We finally employ the zero-shot final model without any collaborators to test its generalization capabilities, namely \textit{Zero-shot detective model}.

\begin{table*}[]
\centering
\caption{\textbf{Trained} models on dev set. The best results (in terms of F1 Macro) are in bold. The generation of the analysis leads to higher performance for both 7B and 13B models.}
\label{tab:analysis-results}

\begin{tabular}{llccccc}
\toprule

\multicolumn{1}{c}{\textbf{Model}} & \multicolumn{1}{c}{\textbf{Classification task}} & \textbf{Analysis included} & \multicolumn{1}{c}{\textbf{Prec}} & \multicolumn{1}{c}{\textbf{Rec}} & \multicolumn{1}{c}{\textbf{F1}} & \multicolumn{1}{c}{\textbf{Acc}} \\
\midrule
Llama 2 7B   
    & Multiple choice     
    & x     
    & 0.49 
    & 0.48     
    & 0.47       
    & 0.56 \\
Llama 2 7B   
    & Multiple choice       
    & \checkmark   
    & 0.57        
    & 0.60     
    & \textbf{0.56}
    & 0.64 \\
    
Llama 2 7B   
    & Single choice   
    & x       
    & 0.40        
    & 0.50     
    & 0.44       
    & 0.80 \\
Llama 2 7B   
    & Single choice  
    & \checkmark   
    & 0.40        
    & 0.50    
    & 0.44       
    & 0.80 \\
\midrule

Llama 2 13B  
    & Single choice   
    & x       
    & 0.55        
    & 0.58     
    & 0.52       
    & 0.57 \\
Llama 2 13B  
    & Multiple choice       
    & \checkmark    
    & 0.65        
    & 0.69     
    & \textbf{0.66}
    & 0.75 \\
\bottomrule
\end{tabular}
\end{table*}

\begin{table*}[]

\caption{ \textbf{Final Results} on dev and test sets: the best collaborator, collaborative agreements, and collaborators within CLUEDO are trained to generate the analysis along with the labels and adopt the MCP approach.}
\label{tab:test-results}
\centering
\begin{tabular}{lcc|cc}
\toprule
\textbf{}      & \multicolumn{2}{c}{\textbf{Dev}}& \multicolumn{2}{c}{\textbf{Test}} \\
\multicolumn{1}{c}{\textbf{Method}}  & \multicolumn{1}{c}{\textbf{F1}} & \multicolumn{1}{c}{\textbf{Acc}} & \multicolumn{1}{c}{\textbf{F1}} & \multicolumn{1}{c}{\textbf{Acc}} \\
\midrule
Best collaborator   
    & 0.66  ($\pm$ 0.001) 
    & 0.75  ($\pm$ 0.001)  
    & 0.69  ($\pm$ 0.001)      
    & 0.75  ($\pm$ 0.001) \\
Collaborators agreement
    & 0.65   ($\pm$ 0.001) 
    & 0.75  ($\pm$ 0.001)  
    & 0.65  ($\pm$ 0.001)      
    & 0.75  ($\pm$ 0.001) \\
\midrule
Zero-shot detective model 
    & 0.63 ($\pm$ 0.038)          
    & 0.71  ($\pm$ 0.024)       
    & \textbf{0.77} ($\pm$ 0.022)     
    & 0.83  ($\pm$ 0.016)\\
CLUEDO
    & \textbf{0.74} ($\pm$ 0.017)       
    & 0.78   ($\pm$ 0.017)  
    & \textbf{0.77} ($\pm$ 0.017)  
    & 0.82  ($\pm$ 0.013) \\
\bottomrule
\end{tabular}
\end{table*}

\section{Experimental Setup}

\noindent \textbf{Models.}
We evaluated various open-source models, employing both zero-shot and fine-tuning methodologies. Our analysis covered Flan T5 XXL~\cite{wei2021finetuned, chung2022scaling}, Llama 2 7B and 13B~\cite{touvron2023llama}, Zephyr 7B~\cite{touvron2023llama}, and Mistral 7B~\cite{jiang2023mistral}, selected for their unique features and performance metrics. Furthermore, we integrated into our assessment GPT-4~\cite{openai2023gpt4} in a zero-shot context.

\noindent \textbf{Training procedure.}
We employed a Supervised Fine-Tuning (SFT) approach with 8-bit quantization. The models were trained for three epochs utilizing Parameter-Efficient Fine-Tuning
(PEFT)~\cite{peft}, with a batch size set at 4 and a learning rate of 5e-5. The sequences were processed with a context length of 4096, optimizing the model's ability to capture long-range dependencies in the data. 

\noindent \textbf{Hardware.}
We ran the experiments on a machine equipped with an Intel\textsuperscript{\textregistered}  Core\textsuperscript{TM} i9-10980XE CPU, $1$ $\times$ Nvidia\textsuperscript{\textregistered} Tesla T4 GPU, and $16$ GB of RAM, running Ubuntu $22.04$ LTS. 

\section{Results}

To illustrate the efficacy of the multiple-choice setting and model selection criteria, we conduct individual tests for each configuration and present the obtained results on the development set. The following paragraphs address the research questions previously presented.


% We investigate the following research questions (RQs):
% \begin{itemize}
% \item   \textbf{RQ1.} Is the multiple-choice setting more effective than the single-choice one?
 
% \item  \textbf{RQ2.} Does including the analysis in the training and generation process improve performance?

% \item \textbf{RQ3.} Is our detective model CLUEDO more effective than individual collaborators in a zero-shot setting?
% Are CLUEDO results more stable?
% \end{itemize}



%%% qui rispondiamo alla RQ: single o multi? cos'è meglio per fare reasoning anche senza training?
\paragraph{RQ1: Impact of the multiple-choice setting.}
Table~\ref{tab:zero-shot-results} shows the zero-shot models' performance on the development set. 
For five out of six model families, the multiple-choice question-answering approach outperforms the single-choice approach in terms of F1 Macro.
There is variability in the performance of different models within the same family. In general, larger models tend to exhibit stronger generalization capabilities than smaller ones. 

%%% qui rispondiamo alla RQ: generare l'analisi (modelli addestrati) è di beneficio per le performance? 
\paragraph{RQ2: Impact of analysis inclusion in model training.}
In Table~\ref{tab:analysis-results}, we highlight the impact of including the analysis in the models' training process.  To examine outcomes across various model sizes and classification tasks, we fixed the model family (Llama 2 from Meta). 
In both the 7B and 13B models, including the analysis (\checkmark) consistently leads to higher performance for multiple-choice tasks.  In particular, including the analysis during training leads to more balanced precision and recall metrics, resulting in an overall improvement in the F1 Macro score. 
For both Llama 2 7B and Llama 2 13B, the F1 Macro scores in single-choice tasks do not show significant improvement with the inclusion of the analysis. This may indicate that these models are less sensitive to additional analysis in single-choice tasks. 

Additionally, the training of Llama 2 13B with the analysis allows for an additional $+0.07$ F1 score compared to its zero-shot counterpart, while for the 7B models, the training deteriorates the performance.


%%% qui rispondiamo alla RQ: Come si comporta CLUEDO? Con che criterio abbiamo scelto le componenti?  
\paragraph{RQ3: CLUEDO results.}

The selection of collaborative models is guided by the results obtained on the development set as shown in Table~\ref{tab:peft-training}. All models are configured to generate both labels and analysis, following the multiple-choice setting. Among the models, Llama 2 13B stands out with the highest F1 Macro score, indicating robust performance across multiple evaluation metrics, followed by Mistral and Zephyr models. For the supervisor model, we choose GPT-4, the best performer in the zero-shot setting (see Table~\ref{tab:zero-shot-results}).

Results on the test set are summarized in Table~\ref{tab:test-results}. Applying corrections based on the consensus of the second and third collaborators (Mistral and Zephyr) slightly reduces the F1 Macro to 0.65 on both development and test sets. This suggests that the initial collaborator's predictions were already quite accurate. 
The zero-shot model without collaborators (GPT-4) achieves an F1 score of 0.63 on the development set, the lowest among the compared methods. 
However, it surpasses all other methods on the test set with a notable F1 Macro of 0.77, showcasing its robust generalization capabilities. The CLUEDO model outperforms other methods with the highest F1 Macro on the development set (0.74) while achieving the second-highest score on test data. 
To assess the stability of predictions, we repeated the experiments five times on the validation and test sets and measured the performance of the models. Even with a greedy decoding strategy, small discrepancies in floating-point operations lead to divergent generations, especially for larger models~\cite{DBLP:journals/corr/abs-2107-03342}. It is known that this issue primarily concerns GPT-4\footnote{Here is some discussion from the OpenAI community on model variability: \url{https://community.openai.com/t/why-the-api-output-is-inconsistent-even-after-}
\url{the-temperature-is-set-to-0/329541}, \url{https://community.openai.com/t/run-same-query-many-times-different-results/140588}}. Therefore, even though the temperature is set to 0 for all experiments, users have often reported significant variations in the output.


Although the predictions of trained models remained consistent, notable differences were observed in GPT-4 predictions, particularly when used without collaborators (the temperature is set to zero with no sampling). The results are presented in Table~\ref{tab:test-results}. With the proposed CLUEDO approach, the standard deviation is reduced by half. Additionally, the error estimate on the development set aligns with the one obtained on the test set. In conclusion, even though CLUEDO may not outperform others on test data, it ensures higher stability in predictions.






\section{Conclusion}

This paper presents a novel solution to SemEval-2024 Task 5, the legal reasoning task, which introduced a challenge for evaluating contemporary legal language models. 
We transform the original dataset into a multiple-choice question-answering problem using the multiple-choice prompting approach and propose an original system, namely \textit{CLUEDO}, that utilizes multiple collaborative LLMs and employs a final \textit{``detective''} model to predict the outcome. Results show that our framework outperforms individual models while returning more stable predictions, securing second place in the public competition.


%\section{Acknowledgments}


\bibliography{custom}
\bibliographystyle{acl_natbib}


\end{document}
