import pandas as pd
import numpy as np
import json
from tqdm import tqdm

## Location of the Wikidata people CSV.
PATH_TO_WIKIDATA_PEOPLE_DF = (
    "/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/all_ents_wdata_comp_people.csv"
)

## Load the CSV; every 'PersonalInfo' cell is turned from its string form
## into a Python object by the converter.
## NOTE(review): eval executes whatever Python expression a cell contains, so
## this is only safe while the CSV is trusted. ast.literal_eval would be the
## safer parser if every cell is a plain literal -- confirm before switching.
df_wikidata_people = pd.read_csv(
    PATH_TO_WIKIDATA_PEOPLE_DF,
    converters={'PersonalInfo': eval},
)

## PersonalInfo is a dictionary.
## Pull out the value stored under each key of interest.
## We only care about father, mother, sibling, spouse, child.
## Create a new column for each of these keys.

## One new column per relation, filled from the PersonalInfo dict
## (NaN when that relation is absent for a person).
relation_keys = ['father', 'mother', 'sibling', 'spouse', 'child']
personal_info = df_wikidata_people['PersonalInfo']
for key in relation_keys:
    df_wikidata_people[key] = personal_info.apply(
        lambda info, wanted=key: info.get(wanted, np.nan)
    )

## Drop people for whom every one of the relation columns is NaN
df_wikidata_people = df_wikidata_people.dropna(how='all', subset=relation_keys)

### Build a dict of the form qid: [father, mother, sibling, spouse, child],
### keeping only the relatives' ids that are actually present (in that order).


def _is_missing(value):
    """Return True when *value* is a float NaN, i.e. the "absent" marker."""
    # Checked by type and value, not with `is np.nan`: a relation column that
    # is entirely NaN is stored as float64, and iterating it yields fresh
    # float objects that are NaN without being the np.nan singleton.
    return isinstance(value, float) and np.isnan(value)


_relation_columns = ['father', 'mother', 'sibling', 'spouse', 'child']
dict_qid = {}
# Walk the needed columns in lockstep instead of building a Series per row.
_relation_rows = zip(
    df_wikidata_people['QID'],
    *(df_wikidata_people[column] for column in _relation_columns),
)
# The zip has no length, so give tqdm the row count for a real progress bar.
for qid, *relations in tqdm(_relation_rows, total=len(df_wikidata_people)):
    related_ids = []
    for relation in relations:
        if _is_missing(relation):
            continue
        # assumes a present relation is a dict holding the relative's id
        # under 'id' -- TODO confirm against the CSV contents
        related_id = relation.get('id', np.nan)
        if not _is_missing(related_id):
            related_ids.append(related_id)
    # A QID seen more than once keeps the entry from its last row.
    dict_qid[qid] = related_ids

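## Next step (not implemented yet): build a list of sets, where each set holds the QIDs of
## people who are related to each other, with no duplicates.
## For example, we don't want both father: [child] and child: [father]; a single
## [father, child] group is better.
## NOTE: the draft below is commented out and would not run as written (it calls .add on a
## list, .append on a dict, and reads an undefined family_set), so the file saved at the end
## is still the per-QID dict.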

# family_sets={}
# for qid, relations in tqdm(dict_qid.items()):
#     family_list = []
#     for rel in relations:
#         if rel in dict_qid:
#             family_list.add(rel)
        
#     family_sets.append(list(family_set))
    

## Write the QID -> related-ids mapping to disk as JSON
output_path = '/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/qid_family_neg_dict.json'
with open(output_path, 'w') as json_file:
    json.dump(dict_qid, json_file)
    