import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part
import json
import pandas as pd
from tqdm import tqdm
import time
import os
import re

# Evaluation prompt template based on G-Eval.
# The placeholders {criteria}, {steps}, {question}, {comment} and {metric} are
# filled in with str.format() by get_geval_score(). The prompt ends by asking
# the model to respond with the score only, which is what generate_text()
# then parses out of the reply.

EVALUATION_PROMPT_TEMPLATE = """
You will be given a question asked in a finance-related community on Reddit and a comment from another user intended to answer the question.

Your task is to rate the comment on one metric.

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Example:


Question:

{question}

Comment:

{comment}


Evaluation (respond with SCORE ONLY):

- {metric}:
"""

# Relevance metric: rubric text (substituted for {criteria}) and rating
# procedure (substituted for {steps}) in EVALUATION_PROMPT_TEMPLATE.
RELEVANCE_SCORE_CRITERIA = """
Relevance (1-5) - the provision of a suitable response to the question.
The comment's explanations should only contain appropriate content to answer the question.
Comments that contain anecdotes, jokes, or off-topic information are penalized strongly.
"""

RELEVANCE_SCORE_STEPS = """
1. Read the question and comment carefully.
2. Identify the main points of the question and comment.
3. Assess the relevance of the comment following the definition of relevance provided above.
4. Assign a score from 1 to 5 to rate the comment. Use the full range of scores to indicate excellent relevance with a 5 and poor relevance with a 1.
"""

# Specificity metric: rubric text (substituted for {criteria}) and rating
# procedure (substituted for {steps}) in EVALUATION_PROMPT_TEMPLATE.
SPECIFICITY_SCORE_CRITERIA = """
Specificity (1-5) - the comment is concise and specific to the question.
The comment should be concise and cover the same scope as the question. The comment should not contain a broad overview of a topic or provide context if not necessary to comprehend the comment.
Comments that contain additional information beyond what is necessary to solve the question are penalized strongly.
"""

SPECIFICITY_SCORE_STEPS = """
1. Read the question and comment carefully.
2. Identify the main points of the question and comment.
3. Assess the specificity of the comment following the definition of specificity provided above.
4. Assign a score from 1 to 5 to rate the comment. Use the full range of scores to indicate excellent specificity with a 5 and poor specificity with a 1.
"""

# Simplicity metric: rubric text (substituted for {criteria}) and rating
# procedure (substituted for {steps}) in EVALUATION_PROMPT_TEMPLATE.
# Unlike the other metrics, these steps refer to the comment only.
SIMPLICITY_SCORE_CRITERIA = """
Simplicity (1-5) - the understandability of the comment.
The comment is written in simple language that is easy to understand and suitable for the target audience of Reddit users.
Sentences that have a complex or long structure, or use difficult words are penalized strongly.
"""

SIMPLICITY_SCORE_STEPS = """
1. Read the comment carefully.
2. Identify the main points of the comment.
3. Assess the simplicity of the comment following the definition of simplicity provided above.
4. Assign a score from 1 to 5 to rate the comment. Use the full range of scores to indicate excellent simplicity with a 5 and poor simplicity with a 1.
"""

# Helpfulness metric: rubric text (substituted for {criteria}) and rating
# procedure (substituted for {steps}) in EVALUATION_PROMPT_TEMPLATE.
HELPFULNESS_SCORE_CRITERIA = """
Helpfulness (1-5) - the level of friendliness, helpfulness, and constructiveness in the comment's language. 
The response aims to solve the author's question and help them understand the solution in a friendly and polite manner.
Responses that are unconstructive or contain any type of toxicity are penalized strongly.
"""

HELPFULNESS_SCORE_STEPS = """
1. Read the question and comment carefully.
2. Identify the main points of the comment.
3. Assess the helpfulness of the comment following the definition of helpfulness provided above.
4. Assign a score from 1 to 5 to rate the comment. Use the full range of scores to indicate excellent helpfulness with a 5 and poor helpfulness with a 1.
"""

# Objectivity metric: rubric text (substituted for {criteria}) and rating
# procedure (substituted for {steps}) in EVALUATION_PROMPT_TEMPLATE.
OBJECTIVITY_SCORE_CRITERIA = """
Objectivity (1-5) - the level of impartiality within a comment.
The comment contains an objective answer to the question.
Responses that are opinionated or biased are penalized.
"""

OBJECTIVITY_SCORE_STEPS = """
1. Read the question and comment carefully.
2. Identify the main points of the comment.
3. Assess the objectivity of the comment following the definition of objectivity shown above.
4. Assign a score from 1 to 5 to rate the comment. Use the full range of scores to indicate excellent objectivity with a 5 and poor objectivity with a 1.
"""

# First integer or decimal number in a model reply, e.g. "4" or "4.5".
# Compiled once because generate_text() runs it for every sampled response.
_SCORE_PATTERN = re.compile(r"\d+(?:\.\d+)?")


def generate_text(model, parameters, samples, prompt: str) -> float:
    """Query the model ``samples`` times and return the mean parsed score.

    Each reply is reduced to the first number it contains. Replies without
    any number, replies whose text cannot be read, and numbers below 1 all
    count as the minimum score of 1.0.

    Args:
        model: Object exposing ``generate_content(prompt, generation_config=...,
            stream=...)`` whose return value has a ``text`` attribute.
        parameters: Generation config passed through to ``generate_content``.
        samples: Number of independent generations to average; must be >= 1.
        prompt: Fully formatted evaluation prompt.

    Returns:
        The arithmetic mean of the parsed scores as a float.

    Raises:
        ValueError: If ``samples`` is smaller than 1.
    """
    if samples < 1:
        # Fail with a clear message instead of a ZeroDivisionError at the end.
        raise ValueError(f"samples must be at least 1, got {samples}")

    scores = []
    for _ in range(samples):
        response = model.generate_content(prompt, generation_config=parameters, stream=False)
        try:
            response_text = response.text
        except Exception as exc:
            # Best effort: an unreadable response is scored with the minimum
            # instead of aborting the run. `Exception` (not a bare except)
            # keeps Ctrl-C / SystemExit working.
            print(f"exception: {exc!r}")
            response_text = ""

        found = _SCORE_PATTERN.search(response_text)
        # Parse the whole number including a decimal part, so "4.5" is not
        # truncated to 4 and "0.5" is not read as 0.
        score = float(found.group()) if found else 1.0
        if score < 1:
            print("score smaller 1")
            score = 1.0
        scores.append(score)
    return sum(scores) / len(scores)

def get_geval_score(
    criteria: str, steps: str, question: str, comment: str, metric: str, model, parameters, samples
):
    """Score one comment on one metric with the G-Eval prompt.

    Fills EVALUATION_PROMPT_TEMPLATE with the rubric (``criteria``), the
    rating procedure (``steps``), the ``question``/``comment`` pair and the
    ``metric`` name, then hands the prompt to generate_text() together with
    ``model``, ``parameters`` and ``samples``.

    Returns:
        Whatever generate_text() returns for the filled prompt (the score
        averaged over ``samples`` generations).
    """
    template_fields = {
        "criteria": criteria,
        "steps": steps,
        "metric": metric,
        "question": question,
        "comment": comment,
    }
    filled_prompt = EVALUATION_PROMPT_TEMPLATE.format(**template_fields)
    return generate_text(model, parameters, samples, prompt=filled_prompt)

# Maps each metric name (used as the {metric} label in the prompt and as the
# result key/CSV column) to its (criteria, steps) prompt texts.
evaluation_metrics = {
    "Relevance": (RELEVANCE_SCORE_CRITERIA, RELEVANCE_SCORE_STEPS),
    "Specificity": (SPECIFICITY_SCORE_CRITERIA, SPECIFICITY_SCORE_STEPS),
    "Simplicity": (SIMPLICITY_SCORE_CRITERIA, SIMPLICITY_SCORE_STEPS),
    "Helpfulness": (HELPFULNESS_SCORE_CRITERIA, HELPFULNESS_SCORE_STEPS),
    "Objectivity": (OBJECTIVITY_SCORE_CRITERIA, OBJECTIVITY_SCORE_STEPS),
}
def remove_prefix_and_suffix(text):
    """Strip known chat-template wrappers from the start and end of ``text``.

    Handles the ``[INST] ... [/INST]`` wrapper and the
    ``<|system|>/<|user|>/<|assistant|>`` wrapper variants. Each marker is
    removed at most once and only when it sits exactly at the corresponding
    end of the string; the steps run in the order listed below.
    """
    strip_steps = (
        (str.removeprefix, "[INST] "),
        (str.removesuffix, " [/INST]"),
        (str.removeprefix, "<|system|>\n\n<|user|>\n"),
        (str.removeprefix, "<|system|>\n</s>\n<|user|>\n"),
        (str.removesuffix, "\n<|assistant|>\n"),
        (str.removesuffix, "</s>\n"),
    )
    for strip, marker in strip_steps:
        text = strip(text, marker)
    return text


# Script body: score every matching prediction file on all metrics with
# Gemini and write one CSV of scores per input file.

INPUT_DIR = "./eval/final_data_to_handle/"
OUTPUT_DIR = "./predictions_new_template_500_no_suffix/"
# Client-side throttle: at most this many requests per 60-second window.
REQUESTS_PER_MINUTE = 300

# Create the output folder up front so a missing directory is caught before
# any API calls are made, not when the first CSV is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Client, model handle and generation settings do not depend on the input
# file, so they are set up once instead of once per file.
vertexai.init(project="gemini-eval-419815", location="us-central1")
model = GenerativeModel("gemini-pro")

parameters = {
    "temperature": 0.0,  # Degree of randomness in token selection -> set to 1.0 when sampling
    "max_output_tokens": 15,  # Upper bound on the length of the reply; only a score is expected.
    "top_p": 1.0,  # Tokens are selected from most probable to least until their probabilities sum to top_p.
    "top_k": 5,  # Restrict selection to the 5 most probable tokens.
}
samples = 1

# Throttle state is shared across files so a file boundary cannot be used to
# exceed the per-minute budget.
requests_in_window = 0
window_start = time.time()

all_files_filtered = [f for f in os.listdir(INPUT_DIR) if "1_2" in f]
for file_name in all_files_filtered:
    print(f"Handling {file_name}...")
    with open(os.path.join(INPUT_DIR, file_name), "r", encoding="utf-8") as input_file:
        big_results = json.load(input_file)

    result = []
    relevant_attribute = "prediction"

    for el in tqdm(big_results):
        excerpt = remove_prefix_and_suffix(el["questions"])
        text_to_test = el[relevant_attribute]
        cur_result = {relevant_attribute: text_to_test}

        for eval_type, (criteria, steps) in evaluation_metrics.items():
            if requests_in_window == 0:
                # The window starts with its first request.
                window_start = time.time()
            elif requests_in_window >= REQUESTS_PER_MINUTE:
                # `>=` rather than `==`: the counter grows by `samples`, so it
                # can step over the limit without ever being equal to it.
                elapsed = time.time() - window_start
                if elapsed < 60:
                    time.sleep(60 - elapsed)
                requests_in_window = 0
                window_start = time.time()
            requests_in_window += samples

            cur_result[eval_type] = get_geval_score(
                criteria=criteria,
                steps=steps,
                question=excerpt,
                comment=text_to_test,
                metric=eval_type,
                model=model,
                parameters=parameters,
                samples=samples,
            )
        result.append(cur_result)

    final_df = pd.DataFrame(result)
    # The output name is the input name up to its first dot, plus ".csv".
    final_df.to_csv(os.path.join(OUTPUT_DIR, f"{file_name.split('.')[0]}.csv"), index=False)
