import pandas as pd
import argparse
from collections import defaultdict
from question_prompts import templates
from pathlib import Path
import json
import re

def parse_args():
    """Parse the command-line options for question creation.

    Options:
        --input_file: JSONL file to generate question data from
            (defaults to '../data/vanya_data/test.jsonl').
        --output_dir: required; directory the question files are written to.
        --prompt_name: base key used to look up the prompt templates
            (defaults to 'does_why_prompt').

    Unrecognised command-line arguments are ignored rather than rejected,
    because ``parse_known_args`` is used.

    Returns:
        argparse.Namespace holding the recognised options.
    """
    cli = argparse.ArgumentParser(description='Question Creation')

    # (flag, add_argument keyword options), registered in declaration order.
    option_specs = (
        ('--input_file', {
            'type': str,
            'default': '../data/vanya_data/test.jsonl',
            'help': 'JSONL file to generate question data from',
        }),
        ('--output_dir', {
            'type': str,
            'required': True,
        }),
        ('--prompt_name', {
            'type': str,
            'default': 'does_why_prompt',
            'help': 'base dictionary key from prompts.py file corresponding to prompts to be used for question creation',
        }),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    known_options, _unrecognised = cli.parse_known_args()
    return known_options

def read_flowgraph_data_file(input_file):
    """Load the JSONL input and normalise it for question generation.

    The 'title' column has the substrings '_-_All_recipes_UK' and '_recipe'
    removed, remaining underscores turned into spaces, and is lower-cased.
    A 'counter' column holding each row's 0-based position in the file is
    added, and the 'file_name' column is dropped.

    Args:
        input_file: path of the JSON-lines file to read.

    Returns:
        pandas.DataFrame with the cleaned 'title', the new 'counter' column
        and without 'file_name'.
    """
    frame = pd.read_json(input_file, lines=True)

    # Order matters: the site suffix and '_recipe' must be stripped before
    # the remaining underscores become spaces.
    title_rewrites = (
        ('_-_All_recipes_UK', ''),
        ('_recipe', ''),
        ('_', ' '),
    )
    titles = frame['title']
    for needle, substitute in title_rewrites:
        titles = titles.str.replace(needle, substitute)
    frame['title'] = titles.str.lower()

    frame['counter'] = range(0, len(frame))
    return frame.drop(columns=['file_name'])

def create_questions(steps, preceding_event_idx, succeeding_event_idx, prompt_name, question_base_type):
    """Render the four question strings for one pair of steps.

    Templates are looked up under ``templates[f'{prompt_name}_{question_base_type}']``
    and formatted with 1-based step numbers (each 0-based index plus one).
    The 'before' templates receive the preceding step as ``e1`` and the
    succeeding step as ``e2``; the 'after' templates receive them swapped.

    Args:
        steps: accepted for interface compatibility; not used here.
        preceding_event_idx: 0-based index of the preceding step.
        succeeding_event_idx: 0-based index of the succeeding step.
        prompt_name: first part of the template dictionary key.
        question_base_type: second part of the template dictionary key.

    Returns:
        Tuple ``(before_why, before_binary, after_why, after_binary)``.
    """
    template_key = f'{prompt_name}_{question_base_type}'

    def render(question_kind, first_idx, second_idx):
        # Convert the 0-based indices into the 1-based numbers shown in the prompt.
        return templates[template_key][question_kind].format(e1=first_idx+1, e2=second_idx+1)

    return (
        render('before_why', preceding_event_idx, succeeding_event_idx),
        render('before_binary', preceding_event_idx, succeeding_event_idx),
        render('after_why', succeeding_event_idx, preceding_event_idx),
        render('after_binary', succeeding_event_idx, preceding_event_idx),
    )

def write_questions_to_file(data, output_dir, question_type):
    """Write question records to ``{output_dir}/{question_type}.jsonl``.

    Each record is serialised with ``json.dumps`` and written on its own
    line. An existing file of the same name is overwritten; an empty
    ``data`` produces an empty file.

    Args:
        data: iterable of JSON-serialisable records.
        output_dir: directory to write into (must already exist).
        question_type: file name stem for the output file.
    """
    # The context manager guarantees the handle is closed even when a record
    # fails to serialise or a write raises part-way through.
    with open(f'{output_dir}/{question_type}.jsonl', 'w') as fp:
        fp.writelines(json.dumps(row) + '\n' for row in data)

def _build_question_record(idx, row, steps, question_type, step_pair, binary_question, why_question):
    """Assemble the output record for one question about one plan row.

    Keys are inserted in a fixed order so the serialised field order of
    every record is the same.
    """
    return {
        'plan_idx': idx,
        'original_file_row_no': row['counter'],
        'title': row['title'],
        'question_idx': idx,
        'steps': steps,
        'question_type': question_type,
        'step_pair_idx_asked_about': step_pair,
        'binary_question': binary_question,
        'why_question': why_question,
    }


def process_plans(df, prompt_name, output_dir):
    """Generate before/after questions for every row and write them to disk.

    For each row, the position of ``destination_text`` in ``steps`` is taken
    as the preceding step and the position of ``origin_text`` as the
    succeeding step. Rows with a truthy ``label`` yield 'dependent_real'
    questions. Rows with a falsy ``label`` yield 'nondependent_real'
    questions plus a 'nondependent_switched_real' variant in which the two
    steps swap places in the step list, while the question text and the
    recorded index pair stay the same.

    One JSONL file per question type is written to ``output_dir`` (six in
    total, including empty ones) and the total number of questions is
    printed.

    Args:
        df: DataFrame with 'steps', 'destination_text', 'origin_text',
            'label', 'counter' and 'title' columns.
        prompt_name: base template key forwarded to ``create_questions``.
        output_dir: existing directory that receives the output files.

    Raises:
        ValueError: if 'destination_text' or 'origin_text' does not occur in
            the row's 'steps' (raised by ``list.index``).
    """
    # Also fixes the order in which the output files are written.
    question_types = (
        'dependent_real_before',
        'dependent_real_after',
        'nondependent_real_before',
        'nondependent_real_after',
        'nondependent_switched_real_before',
        'nondependent_switched_real_after',
    )
    questions_by_type = {question_type: [] for question_type in question_types}

    for idx, row in df.iterrows():
        original_steps = row['steps']
        preceding_event_idx = original_steps.index(row['destination_text'])
        succeeding_event_idx = original_steps.index(row['origin_text'])
        step_pair = (preceding_event_idx, succeeding_event_idx)

        is_dependent = bool(row['label'])
        question_type_prefix = 'dependent_real' if is_dependent else 'nondependent_real'

        # The template family is the first word of the prefix
        # ('dependent' or 'nondependent').
        before_why_question, before_binary_question, after_why_question, after_binary_question = create_questions(
            original_steps,
            preceding_event_idx,
            succeeding_event_idx,
            prompt_name,
            question_type_prefix.split('_')[0],
        )

        # (question type prefix, step list) pairs to emit for this row.
        step_variants = [(question_type_prefix, original_steps)]
        if not is_dependent:
            # Swap the two steps on a copy so the original list is untouched.
            switched_steps = list(original_steps)
            switched_steps[preceding_event_idx], switched_steps[succeeding_event_idx] = (
                switched_steps[succeeding_event_idx],
                switched_steps[preceding_event_idx],
            )
            step_variants.append(('nondependent_switched_real', switched_steps))

        for prefix, steps in step_variants:
            questions_by_type[f'{prefix}_before'].append(_build_question_record(
                idx, row, steps, f'{prefix}_before', step_pair,
                before_binary_question, before_why_question,
            ))
            questions_by_type[f'{prefix}_after'].append(_build_question_record(
                idx, row, steps, f'{prefix}_after', step_pair,
                after_binary_question, after_why_question,
            ))

    total_questions = sum(len(records) for records in questions_by_type.values())
    print(f'Total questions created: {total_questions}')

    for question_type in question_types:
        write_questions_to_file(questions_by_type[question_type], output_dir, question_type)

def main(args):
    """Run the pipeline: load the input, ensure the output directory, write questions.

    Args:
        args: namespace providing ``input_file``, ``output_dir`` and
            ``prompt_name``.
    """
    plans = read_flowgraph_data_file(args.input_file)

    # Created only after the input loads, including any missing parents.
    output_path = Path(args.output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    process_plans(df=plans, prompt_name=args.prompt_name, output_dir=args.output_dir)

# Script entry point: runs only when the file is executed directly, not when
# it is imported. Parses the command-line options and hands them to main().
if __name__ == '__main__':
    args = parse_args()
    main(args)
