import pandas as pd
import model_evaluation as me
import data_processing as dp
import plotting as plt

def run_rte_comparison_analysis(wvc_test):
    """Compare RTE models against LLM scores and plot the top performers.

    Steps, in order: load the RTE models, evaluate them on a copy of
    ``wvc_test``, load the LLM scores, combine both score sets, compare the
    models overall and per class, select the top models, show the summary
    table, and plot confusion matrices for three fixed models.

    Args:
        wvc_test: Evaluation data. Only its ``.copy()`` method is used
            directly here; the copy is what gets passed to the helpers.
            Presumably a pandas DataFrame -- confirm against callers.

    Returns:
        tuple: ``(top_perf_df, summary_df, conf_mat)`` where the first two
        come from ``me.select_top_models`` and the last is whatever
        ``plt.plot_confmat`` returns.
    """
    rte_models = me.load_rte_models()

    # Work on a copy so the helpers below never receive the caller's object.
    eval_df = wvc_test.copy()

    # The return value is discarded, so this call presumably annotates
    # eval_df in place -- TODO confirm in model_evaluation.
    me.evaluate_models(rte_models, eval_df)

    # Merge the LLM scores with the (RTE-evaluated) frame.
    llm_scores = dp.load_llm_scores()
    scored_df = me.combine_scores(eval_df, llm_scores)

    # Overall comparison and per-class breakdown feed the model selection.
    comp_rdf = me.compare_models(scored_df)
    accuracy_scores_3class = me.compare_models_by_class(scored_df)
    top_perf_df, summary_df = me.select_top_models(comp_rdf, accuracy_scores_3class)

    # `display` is not imported by this module; it is normally available only
    # inside IPython/Jupyter sessions. Fall back to print() elsewhere instead
    # of failing with NameError after all the work above has been done.
    try:
        show = display  # noqa: F821
    except NameError:
        show = print
    show(summary_df)

    # Plot confusion matrices for the three models of interest.
    conf_mat = plt.plot_confmat(
        top_perf_df,
        models=['res_rob', 'llm_output_annot_instr', 'roberta_mnli'],
        model_names=['Res-RoBERTa WVC',
                     'GPT-3 (Annotator\n Instr. w/ Instructions)',
                     'RoBERTa MNLI'],
    )

    return top_perf_df, summary_df, conf_mat


def compare_prefix_models(prefix_test_df):
    """Score two model families across five hypothesis formats.

    For every hypothesis prefix, results are collected twice through
    ``dp.collect_results``: once with the default settings (stored under
    'Resonance-Tuned RoBERTa') and once with ``model_type='RTE'`` (stored
    under 'RoBERTa-large MNLI'). Each result is then scored with
    ``me.score_df`` and the scores are gathered into one table.

    Args:
        prefix_test_df: Test data handed unchanged to ``dp.collect_results``.

    Returns:
        tuple: ``(eval_results_dict, comp_rdf)``.
        ``eval_results_dict`` maps model name -> prefix -> the raw value
        returned by ``dp.collect_results``. ``comp_rdf`` is a DataFrame with
        columns ``hyp_format, model, accuracy, precision, recall, F1,
        conf_mat`` and a fresh 0..n-1 index.
    """
    # Order in which results are collected; determines the key order of the
    # nested dicts that are returned to the caller.
    collection_order = ('raw_stem', 'author_believes', 'text_expresses',
                        'text_expresses_belief', 'original_hypothesis')
    # Order in which rows appear in the comparison table.
    report_order = ('text_expresses', 'author_believes', 'raw_stem',
                    'text_expresses_belief', 'original_hypothesis')

    def encode_prediction(label):
        # String prediction -> integer class: entailment=2, contradiction=0,
        # anything else=1.
        if label == 'entailment':
            return 2
        if label == 'contradiction':
            return 0
        return 1

    eval_results_dict = {'Resonance-Tuned RoBERTa': {},
                         'RoBERTa-large MNLI': {}}
    for prefix in collection_order:
        eval_results_dict['Resonance-Tuned RoBERTa'][prefix] = dp.collect_results(
            prefix_test_df, model=prefix)
        eval_results_dict['RoBERTa-large MNLI'][prefix] = dp.collect_results(
            prefix_test_df, model=prefix, model_type='RTE')

    # Collect one small frame per (model, hypothesis) pair and concatenate
    # once at the end. Growing a DataFrame with pd.concat inside the loop
    # copies all earlier rows each time, and seeding it with an empty frame
    # triggers pandas' deprecation warning about empty entries in concat.
    score_frames = []
    for model, per_prefix in eval_results_dict.items():
        for hyp in report_order:
            # Element 0 of the collected result exposes `.prediction` and
            # `.label`.
            predictions = per_prefix[hyp][0]
            acc_df = me.score_df(
                predictions.prediction.apply(encode_prediction).values,
                predictions.label,
            )
            score_frames.append(pd.DataFrame({'hyp_format': hyp,
                                              'model': model,
                                              'accuracy': acc_df['accuracy'],
                                              'precision': acc_df['precision'],
                                              'recall': acc_df['recall'],
                                              'F1': acc_df['F1'],
                                              'conf_mat': acc_df['conf_mat']}))

    comp_rdf = pd.concat(score_frames, ignore_index=True)
    return eval_results_dict, comp_rdf


def conduct_cross_data_model_evaluation(wvc_test, touche_test, noise_test, complete_test):
    """Run the error analysis over four test sets and chart the results.

    The four inputs are keyed as 'wvc', 'touche', 'noise' and 'complete' and
    handed to ``dp.execute_error_analysis`` together with an initially empty
    results dict (presumably filled in place by that helper -- confirm in
    data_processing). The results are then tabulated by
    ``dp.collect_results_df`` and plotted by
    ``plt.generate_comparison_charts``.

    Args:
        wvc_test: Data stored under the 'wvc' key.
        touche_test: Data stored under the 'touche' key.
        noise_test: Data stored under the 'noise' key.
        complete_test: Data stored under the 'complete' key.

    Returns:
        tuple: ``(results_df, bar, box)`` -- the collected results table and
        the two objects returned by ``plt.generate_comparison_charts``.
    """
    model_keys = ('WVC', 'HVE', 'Complete', 'Entailment')

    datasets = dict(
        wvc=wvc_test,
        touche=touche_test,
        noise=noise_test,
        complete=complete_test,
    )

    # Long labels are used for the error-analysis step.
    long_labels = {
        'wvc': 'WVC Test Set',
        'touche': 'Touche Test Set',
        'noise': 'Noise Test Set',
        'complete': 'Complete (WVC+Touche+Noise)',
    }

    collected_runs = {}
    # Each helper gets its own fresh list of model keys.
    dp.execute_error_analysis(collected_runs, datasets, long_labels, list(model_keys))

    # Model key -> display name used when collecting the results table.
    display_names = dict(zip(
        model_keys,
        ('Res-RoBERTa WVC', 'Res-RoBERTa HVE', 'Res-RoBERTa Complete', 'RoBERTa MNLI'),
    ))

    # Short labels are used for the results table.
    short_labels = {
        'wvc': 'WVC',
        'touche': 'Touche HV',
        'noise': 'Noise',
        'complete': 'Complete',
    }

    results_df = dp.collect_results_df(
        collected_runs, list(model_keys), display_names, short_labels)
    bar, box = plt.generate_comparison_charts(results_df)

    return results_df, bar, box