import json 
import pandas as pd 
import numpy as np
import requests
from tqdm import tqdm
from multiprocessing import Pool
import subprocess
# PATH_TO_FP_QID_DATA="/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/cleaned_fp_data_wikidata_IDs.json"
# JSON file that is loaded into `ents` further down (presumably the unique Wikidata QIDs whose labels get fetched - TODO confirm).
PATH_TO_UNIQUE_ENTS="/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/instance_type_entities_comp_people.json"
# CSV of all entities; only referenced by the commented-out pipeline steps in this file.
PATH_TO_ALL_ENT_DF="/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/all_ents_wdata_comp_people.csv"
# JSON of first-paragraph data; only referenced by the commented-out pipeline steps (apparently keyed by Wikipedia title - verify).
PATH_TO_FP_DATA="/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/cleaned_fp_data.json"
# CSV of predictions read with pandas; the script uses its 'gt_entity_label' and 'ground_truth' columns.
SOTU_PREDICTIONS_DF="/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/asymmetric_disambiguation_full_100/predictions/newspapers_sotu/predictions.csv"

def add_mention_info(template,title):
    """
    Describes the mention span of the title at the very start of the template.

    Parameters:
        template (str): The template text, expected to begin with the title.
        title (str): The mention text.

    Returns:
        dict: "mention_start" (always 0), "mention_end" (length of the title)
        and "mention_text" (the title itself).
    """
    title_length = len(title)

    # The mention must be exactly the leading characters of the template.
    leading_text = template[:title_length]
    assert leading_text == title

    mention = {"mention_start": 0}
    mention["mention_end"] = title_length
    mention["mention_text"] = title
    return mention

def _join_with_and(items):
    """Joins a non-empty list as "a, b, and c"; a single item is returned as-is."""
    if len(items) > 1:
        return ", ".join(items[:-1]) + ", and " + items[-1]
    return items[0]


def create_template(title, instance_of, aliases):
    """
    Creates a formatted string that describes an entity with its title, types, and aliases.

    Parameters:
        title (str): The title of the entity.
        instance_of (list): A list of types or categories of the entity.
        aliases (list): A list of other names or aliases for the entity.

    Returns:
        str: A formatted sentence describing the entity. The title always
        comes first, so the result starts with the title.
    """
    # An empty/None type list falls back to the literal "unknown type", so the
    # "no type" sentences below are only produced when the formatted type text
    # itself is empty (e.g. a single empty-string type).
    type_text = _join_with_and(instance_of) if instance_of else "unknown type"

    # Use an empty string (not a list) as the "no aliases" marker so both
    # pieces of text share one type.
    alias_text = _join_with_and(aliases) if aliases else ""

    if type_text and alias_text:
        return f"{title} is of type {type_text}. Also known as {alias_text}."
    if type_text:
        return f"{title} is of type {type_text}."
    if alias_text:
        return f"{title} is also known as {alias_text}."
    return f"{title}."

def replace_qids_with_labels(qids):
    """
    Maps QIDs to their labels using the module-level `ent_labels` lookup.

    QIDs that are absent from `ent_labels`, or whose stored label is None,
    are dropped; the order of the remaining labels follows `qids`.
    """
    labels = []
    for qid in qids:
        if qid not in ent_labels:
            continue
        label = ent_labels[qid]
        if label is not None:
            labels.append(label)
    return labels

def get_ent_label(entity):
    """
    Fetches the English label of a Wikidata entity through the wbgetentities API.

    Parameters:
        entity (str): The Wikidata entity ID (e.g. "Q42").

    Returns:
        str or None: The English label, or None when the response carries no
        English label for the entity.

    Raises:
        requests.RequestException: On network failures or timeouts.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        # Let requests URL-encode the values instead of interpolating them into the URL.
        params={
            "action": "wbgetentities",
            "ids": entity,
            # Only the English label is read, so do not download the full entity document.
            "props": "labels",
            "languages": "en",
            "format": "json",
        },
        # Without a timeout a stalled connection would block the caller forever.
        timeout=30,
    )
    data = response.json()
    try:
        return data['entities'][entity]['labels']['en']['value']
    except (KeyError, TypeError):
        # Missing entity / missing English label, or an unexpected response shape.
        # A bare except would also swallow KeyboardInterrupt and SystemExit.
        return None

# Load the unique entities from JSON (presumably the Wikidata QIDs whose labels are fetched - confirm against the file).
# JSON text is UTF-8 by specification, so decode it explicitly rather than relying on the locale default.
with open(PATH_TO_UNIQUE_ENTS, encoding="utf-8") as f:
    ents = json.load(f)




# ##Use multiprocessing to get the labels for all entities - have a tqdm progress bar
# pool = Pool(3)

# ###Use multiprocessing to get the labels for all entities with a progress bar
# ent_labels = list(tqdm(pool.imap(get_ent_label, ents), total=len(ents)))



# ##Save entity labels to a file
# with open("entity_labels.json", 'w') as f:
#     json.dump(ent_labels, f)


#Load the df

# ##InstanceOfLabels is a python list - load it as such - from string to python list (eval)
# df=pd.read_csv(PATH_TO_ALL_ENT_DF)
# df['InstanceOfLabels'] = df['InstanceOfLabels'].apply(eval)
# df['Aliases'] = df['Aliases'].apply(eval)
# print(df.head())

# print(df.columns)

# ###now, InstanceOfLabels is a list of qids. Replace the corresponding qids with the labels
# with open("entity_labels.json", 'r') as f:
#     ent_labels = json.load(f)


# ###For each of the unique entities, get the wikidata label for the entity. 
# with open(PATH_TO_UNIQUE_ENTS, 'r') as f:
#     ents = json.load(f)

# ##Both the labels and the qids are in the same order. So, make a dict with qid as key and label as value
# ent_labels = dict(zip(ents, ent_labels))
    


# df['InstanceOfLabels'] = df['InstanceOfLabels'].apply(replace_qids_with_labels)


# ##Make template - use EngWikipediaTitle for title, InstanceOfLabels for instance_of, and Aliases for aliases
# df['template'] = df.apply(lambda x: create_template(x['EngWikipediaTitle'], x['InstanceOfLabels'], x['Aliases']), axis=1)


# #Save the df
# df.to_csv("/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/all_ents_wdata_with_templates.csv", index=False)

##Open df
# df=pd.read_csv("/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/all_ents_wdata_with_templates.csv",
#                 converters={"InstanceOfLabels":eval, "Aliases":eval})


# ##Drop those rows where EngWikipediaTitle is nan
# df.dropna(subset=['EngWikipediaTitle'], inplace=True)

# # print(df.head())


# # ##Add templates to the first para data
# with open(PATH_TO_FP_DATA, 'r') as f:
#     first_para_data = json.load(f)
    
# ##Set EngWikipediaTitle as index of the df
# df.set_index('EngWikipediaTitle', inplace=True)

# ###Check how many entities in the first para data are not in the df
# fp_entities=set(first_para_data.keys())
# df_entities=set(df.index)
# intersection=fp_entities.intersection(df_entities)



# Read the SOTU predictions and collect the distinct ground-truth entity labels,
# used to check which SOTU entities are missing from the first-paragraph data.
sotu_predictions = pd.read_csv(SOTU_PREDICTIONS_DF)
sotu_entities = set()
sotu_entities.update(sotu_predictions["gt_entity_label"].values)

# print(len(sotu_entities.difference(fp_entities)),"entities in the sotu predictions are not in the first para data")
# print(sotu_entities.difference(fp_entities))
##Print entities that are in the sotu predictions but not in the first para data
# for entity in sotu_entities.difference(fp_entities):
#     print(entity)
#     print(sotu_predictions[sotu_predictions['gt_entity_label']==entity])


##Add the template, mention info to the first para data
##Format: {'wiki_title': 'Bob Dole 1996 presidential campaign', 'text': 'Bob Dole 1996 presidential campaign is of type presidential campaign. Also known as Bob Dole 1996 presidential campaign, and 1996 Bob Dole presidential campaign. The 1996 presidential campaign of Bob Dole began when Republican Senator Bob Dole formally announced his candidacy for Republican Party nomination in 1995. After beating other candidates in the primaries, he became the Republican nominee, with his challenger being Democratic incumbent President Bill Clinton in the 1996 presidential election. Dole conceded defeat in the race in a telephone call to Clinton on November 5, 1996.==', 'wikidata_info': {'wikidata_id': 'Q107058306', 'instance_of_labels': ['presidential campaign'], 'aliases': ['Bob Dole 1996 presidential campaign', '1996 Bob Dole presidential campaign']}, 'template': 'Bob Dole 1996 presidential campaign is of type presidential campaign. Also known as Bob Dole 1996 presidential campaign, and 1996 Bob Dole presidential campaign.', 'mention_start': 0, 'mention_end': 35, 'mention_text': 'Bob Dole 1996 presidential campaign'}
# formatted_dict = {}
# for entity in tqdm(intersection):
    
#     relevant_row=df.loc[entity]

    
#     qid=relevant_row['QID']
#     if type(qid)!=str:
#         continue
   
#     formatted_dict[qid] = {}
    
#     ##title is an index now, so get it from the index
#     wiki_title = entity

#     formatted_dict[qid]['wiki_title'] = wiki_title
#     ##Template
#     template = relevant_row['template']
#     ##Add mention info'


#     mention_info = add_mention_info(template, wiki_title)
#     ##Add the template, mention info to the dict
#     formatted_dict[qid]['template'] = template
#     formatted_dict[qid]['mention_start'] = mention_info['mention_start']
#     formatted_dict[qid]['mention_end'] = mention_info['mention_end']
#     formatted_dict[qid]['mention_text'] = mention_info['mention_text']
    
#     ##Add the text - template + value of the first para
#     text = template + first_para_data[entity]
    
#     formatted_dict[qid]['text'] = text
#     ##Add wikidata info
#     formatted_dict[qid]['wikidata_info'] = {}
#     formatted_dict[qid]['wikidata_info']['wikidata_id'] = qid
#     formatted_dict[qid]['wikidata_info']['instance_of_labels'] = relevant_row['InstanceOfLabels']
#     formatted_dict[qid]['wikidata_info']['aliases'] = relevant_row['Aliases']


# print("Number of entities in the first para data also in wikidata: ", len(formatted_dict))


##Save the formatted dict
# with open("/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/formatted_first_para_data_qid_template_with_missing_ents.json", 'w') as f:
#     json.dump(formatted_dict, f)
    
###Open the formatted dict
# with open("/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/formatted_first_para_data_qid_template_with_missing_ents.json", 'r') as f:
#     formatted_dict = json.load(f)
    
# ##For missing entities, check if a page exists in wikipedia as one of the aliases
# ##If it does, add the page to the formatted dict
# print("Attempting to fill up missing entities with wikipedia pages as aliases...")
# missing_entities=df_entities.difference(fp_entities)
# extra_matched_fp_data_entities=[]
# for entity in tqdm(missing_entities):
    
    
    
#     relvant_df_row=df.loc[entity]
#     entity_aliases=relvant_df_row['Aliases']
#     qid=relvant_df_row['QID']
#     if type(qid)!=str:
#         continue
#     instances=relvant_df_row['InstanceOfLabels']
    
#     if len(entity_aliases)==0:
#         continue
    
#     if set(entity_aliases).intersection(fp_entities)==0:
#         continue
    
#     for alias in entity_aliases:
#         if alias in fp_entities:
#             ##Add the entity to the formatted dict
#             formatted_dict[qid] = {}
#             ##Get the first para data key as title
#             formatted_dict[qid]['wiki_title'] = alias
#             ##Template
#             template = create_template(alias, instances, entity_aliases)

#             ##Add mention info'
#             mention_info = add_mention_info(template, alias)
#             ##Add the template, mention info to the dict
#             formatted_dict[qid]['template'] = template
#             formatted_dict[qid]['mention_start'] = mention_info['mention_start']
#             formatted_dict[qid]['mention_end'] = mention_info['mention_end']
#             formatted_dict[qid]['mention_text'] = mention_info['mention_text']
#             ##Add the text - template + value of the first para
#             text = template + first_para_data[alias]
#             formatted_dict[qid]['text'] = text
#             ##Add wikidata info
#             formatted_dict[qid]['wikidata_info'] = {}
#             formatted_dict[qid]['wikidata_info']['wikidata_id'] = qid
#             formatted_dict[qid]['wikidata_info']['instance_of_labels'] = []
#             formatted_dict[qid]['wikidata_info']['aliases'] = []
#             extra_matched_fp_data_entities.append(qid)
            
#             break
        
# print("Number of missing entities filled up with wikipedia pages as aliases: ", len(extra_matched_fp_data_entities))
            

        

        
        

# ##Save the formatted dict
# with open("/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/formatted_first_para_data_qid_template_people.json", 'w') as f:
#     json.dump(formatted_dict, f)

# Load the formatted first-paragraph entries (a JSON object that is indexed by QID string below).
with open("/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/formatted_first_para_data_qid_template_people.json", encoding="utf-8") as f:
    formatted_dict = json.load(f)

# Report which ground-truth QIDs from the sotu predictions have no entry in the formatted dict.
sotu_qids = set(sotu_predictions['ground_truth'].values)
# Compute the difference once and reuse it for both the count and the listing;
# set.difference accepts the dict directly and compares against its keys.
missing_sotu_qids = sotu_qids.difference(formatted_dict)
print("Number of sotu qids missing in the formatted dict: ", len(missing_sotu_qids))
print(missing_sotu_qids)

# Spot-check two specific entries; a QID that is absent raises KeyError here.
print(formatted_dict["Q449356"])

print(formatted_dict["Q2042"])
# print(len(formatted_dict))
    
# rclone copy -P /mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/formatted_first_para_data_qid_template.json dell_db:entity_training/
##run this command to copy the file to the dell_db entity_training folder
 
    
# subprocess.run(["rclone", "copy", "-P", "/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/formatted_first_para_data_qid_template.json", "dell_db:entity_training/"])
    