from openai import OpenAI
import pandas as pd
import json
from tqdm import tqdm
import time
import math
from template import EVALUATION_PROMPT_TEMPLATE, RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS, COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS, CONSISTENCY_SCORE_CRITERIA, CONSISTENCY_SCORE_STEPS, FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS, PLAUSIBILITY_SCORE_CRITERIA, PLAUSIBILITY_SCORE_STEPS

# Evaluation prompt template based on G-Eval.
#
# NOTE: this assignment rebinds EVALUATION_PROMPT_TEMPLATE, so the object of
# the same name imported from `template` at the top of the file is not the one
# used by the code below.
#
# Named str.format fields contained in the text:
#   {criteria}, {steps}, {question}, {comment}, {metric}

EVALUATION_PROMPT_TEMPLATE = """
You will be given a question asked in a finance-related community on Reddit and a comment from another user intended to answer the question.

Your task is to rate the comment on one metric.

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Example:


Question:

{question}

Comment:

{comment}


Evaluation (respond with SCORE ONLY):

- {metric}:
"""

# Per-metric prompt fragments. Every metric comes as a pair:
#   *_SCORE_CRITERIA - the definition of the metric and what gets penalized;
#   *_SCORE_STEPS    - the numbered procedure the rater is told to follow,
#                      ending with a 1-5 score.
# The pairs are collected in the `evaluation_metrics` mapping further down.

# Relevance: does the comment contain only content that answers the question?
RELEVANCE_SCORE_CRITERIA = """
Relevance (1-5) - the provision of a suitable response to the question.
The comment's explanations should only contain appropriate content to answer the question.
Comments that contain anecdotes, jokes, or off-topic information are penalized strongly.
"""

RELEVANCE_SCORE_STEPS = """
1. Read the question and comment carefully.
2. Identify the main points of the question and comment.
3. Assess the relevance of the comment following the definition of relevance provided above.
4. Assign a score from 1 to 5 to rate the comment. Use the full range of scores to indicate excellent relevance with a 5 and poor relevance with a 1.
"""

# Specificity: is the comment concise and scoped to the question?
SPECIFICITY_SCORE_CRITERIA = """
Specificity (1-5) - the comment is concise and specific to the question.
The comment should be concise and cover the same scope as the question. The comment should not contain a broad overview of a topic or provide context if not necessary to comprehend the comment.
Comments that contain additional information beyond what is necessary to solve the question are penalized strongly.
"""

SPECIFICITY_SCORE_STEPS = """
1. Read the question and comment carefully.
2. Identify the main points of the question and comment.
3. Assess the specificity of the comment following the definition of specificity provided above.
4. Assign a score from 1 to 5 to rate the comment. Use the full range of scores to indicate excellent specificity with a 5 and poor specificity with a 1.
"""

# Simplicity: is the comment written in simple, easy-to-understand language?
SIMPLICITY_SCORE_CRITERIA = """
Simplicity (1-5) - the understandability of the comment.
The comment is written in simple language that is easy to understand and suitable for the target audience of Reddit users.
Sentences that have a complex or long structure, or use difficult words are penalized strongly.
"""

# Unlike the other step lists, these steps refer to the comment only.
SIMPLICITY_SCORE_STEPS = """
1. Read the comment carefully.
2. Identify the main points of the comment.
3. Assess the simplicity of the comment following the definition of simplicity provided above.
4. Assign a score from 1 to 5 to rate the comment. Use the full range of scores to indicate excellent simplicity with a 5 and poor simplicity with a 1.
"""

# Helpfulness: is the comment friendly, constructive and free of toxicity?
HELPFULNESS_SCORE_CRITERIA = """
Helpfulness (1-5) - the level of friendliness, helpfulness, and constructiveness in the comment's language. 
The response aims to solve the author's question and help them understand the solution in a friendly and polite manner.
Responses that are unconstructive or contain any type of toxicity are penalized strongly.
"""

HELPFULNESS_SCORE_STEPS = """
1. Read the question and comment carefully.
2. Identify the main points of the comment.
3. Assess the helpfulness of the comment following the definition of helpfulness provided above.
4. Assign a score from 1 to 5 to rate the comment. Use the full range of scores to indicate excellent helpfulness with a 5 and poor helpfulness with a 1.
"""

# Objectivity: is the answer impartial rather than opinionated or biased?
OBJECTIVITY_SCORE_CRITERIA = """
Objectivity (1-5) - the level of impartiality within a comment.
The comment contains an objective answer to the question.
Responses that are opinionated or biased are penalized.
"""

OBJECTIVITY_SCORE_STEPS = """
1. Read the question and comment carefully.
2. Identify the main points of the comment.
3. Assess the objectivity of the comment following the definition of objectivity shown above.
4. Assign a score from 1 to 5 to rate the comment. Use the full range of scores to indicate excellent objectivity with a 5 and poor objectivity with a 1.
"""

def get_geval_score(
    criteria: str, steps: str, document: str, summary: str, metric_name: str
) -> float:
    """Return the probability-weighted score the model gives for one metric.

    The prompt is built from ``EVALUATION_PROMPT_TEMPLATE`` and sent through
    the module-level ``client``. Only the first generated token is inspected:
    every one of its top candidates that parses as a finite number contributes
    ``value * probability`` to the result.

    Args:
        criteria: Text for the template's ``{criteria}`` field.
        steps: Text for the template's ``{steps}`` field.
        document: The question text; fills the template's ``{question}`` field.
        summary: The comment being rated; fills the ``{comment}`` field.
        metric_name: Metric label; fills the ``{metric}`` field.

    Returns:
        The sum of ``value * probability`` over the numeric candidates of the
        first token, or ``0.0`` when none of the candidates is numeric. The
        probabilities are used as returned and are not renormalised.

    Raises:
        KeyError: If the template contains a field that is not supplied here.
    """
    # The keyword names must match the template's fields exactly; str.format
    # raises KeyError for any field it cannot resolve. The parameter names of
    # this function are kept as they are so existing callers are unaffected.
    prompt = EVALUATION_PROMPT_TEMPLATE.format(
        criteria=criteria,
        steps=steps,
        question=document,
        comment=summary,
        metric=metric_name,
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        # model= "gpt-4-0125-preview", # 500k TPD limit
        # model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        seed=42,
        temperature=0,
        max_tokens=5,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        logprobs=True,
        top_logprobs=5,
    )
    # Candidate tokens (with log-probabilities) for the first generated token.
    top_candidates = response.choices[0].logprobs.content[0].top_logprobs
    expected_score = 0.0
    for candidate in top_candidates:
        try:
            value = float(candidate.token)
        except ValueError:
            # Not a number (e.g. a word or punctuation): carries no score.
            continue
        if not math.isfinite(value):
            # float() also accepts "nan"/"inf"/"infinity"; a single such token
            # would turn the whole score into nan/inf.
            continue
        expected_score += value * math.exp(candidate.logprob)
    return expected_score


# Metric name -> (criteria text, steps text). The name is passed to
# get_geval_score as the metric label and is also the result column name.
evaluation_metrics = {
    "Relevance": (RELEVANCE_SCORE_CRITERIA, RELEVANCE_SCORE_STEPS),
    "Specificity": (SPECIFICITY_SCORE_CRITERIA, SPECIFICITY_SCORE_STEPS),
    "Simplicity": (SIMPLICITY_SCORE_CRITERIA, SIMPLICITY_SCORE_STEPS),
    "Helpfulness": (HELPFULNESS_SCORE_CRITERIA, HELPFULNESS_SCORE_STEPS),
    "Objectivity": (OBJECTIVITY_SCORE_CRITERIA, OBJECTIVITY_SCORE_STEPS),
}

# Field of each input record that gets scored; it also prefixes the output
# file name. Defined before the loop so the final write works even when the
# input list is empty.
relevant_attribute = "answer_1"

# Client-side throttle: at most MAX_REQUESTS_PER_WINDOW scoring calls are
# issued per WINDOW_SECONDS.
MAX_REQUESTS_PER_WINDOW = 60
WINDOW_SECONDS = 60

# Decode explicitly as UTF-8 instead of relying on the platform default.
with open(
    "eval/data/complete_qa_final_filtered_test.json", "r", encoding="utf-8"
) as input_file:
    big_results = json.load(input_file)

# get_geval_score reads this client as a module-level global.
client = OpenAI()

result = []
requests = 0
start = time.time()
for el in tqdm(big_results):
    # The text handed to the scorer as the question is the record's "text"
    # field followed by its "context" field.
    excerpt = el["text"] + " " + el["context"]
    text_to_test = el[relevant_attribute]
    cur_result = {relevant_attribute: text_to_test}
    for eval_type, (criteria, steps) in evaluation_metrics.items():
        if requests == MAX_REQUESTS_PER_WINDOW:
            # Quota for this window is used up: wait out the remainder of the
            # window (if any), then open a new one.
            delta = time.time() - start
            if delta < WINDOW_SECONDS:
                print(f"sleeping for {WINDOW_SECONDS - delta}")
                time.sleep(WINDOW_SECONDS - delta)
            requests = 0
            start = time.time()
        requests += 1
        try:
            # The call itself must sit inside the try; otherwise the fallback
            # below can never run.
            score_num = get_geval_score(criteria, steps, excerpt, text_to_test, eval_type)
        except (AttributeError, IndexError, TypeError):
            # Raised while reading the first token's logprobs when the
            # response carries none. Other errors (including API failures)
            # are deliberately left to propagate.
            score_num = None
            print(f"{eval_type} Score: Text was empty!")
        else:
            print(f"{eval_type} Score: {score_num}")
        cur_result[eval_type] = score_num
    result.append(cur_result)

# One row per input record: the scored text plus one column per metric.
final_df = pd.DataFrame(result)
final_df.to_csv(f"{relevant_attribute}_predictions_openai_3_5_logprobs.csv", index=False)