import pandas as pd 
import json
from tqdm import tqdm
QRANK_CSV="/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/qrank.csv.gz"
PATH_TO_INF_CORPUS="/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/formatted_first_para_data_qid_template_people_with_median_dates.json"
PATH_TO_UPDATED_INF_CORPUS="/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/formatted_first_para_data_qid_template_people_with_qrank.json"


def build_qrank_lookup(qrank_df):
    """Return a plain ``{Entity: int(QRank)}`` dict built from *qrank_df*.

    Args:
        qrank_df: DataFrame with an ``Entity`` column and a ``QRank`` column.

    Returns:
        dict mapping each entity id to its QRank as a built-in ``int``.
        Rows whose QRank is NaN are dropped; if an entity appears more than
        once, the first occurrence wins.
    """
    ranked = qrank_df.dropna(subset=['QRank'])
    # A repeated Entity would make a label-based lookup ambiguous, so keep one row.
    ranked = ranked.drop_duplicates(subset='Entity', keep='first')
    # int() yields built-in ints, which json.dump can serialize.
    return {qid: int(rank) for qid, rank in zip(ranked['Entity'], ranked['QRank'])}


def add_qrank(inf_corpus, qrank_by_qid, show_progress=False):
    """Set ``data['qrank']`` on every value dict of *inf_corpus*, in place.

    Args:
        inf_corpus: dict keyed by qid whose values are dicts.
        qrank_by_qid: dict mapping qid to an int QRank.
        show_progress: when True, wrap the iteration in a tqdm progress bar.

    Returns:
        The same *inf_corpus* object. Entries whose qid is missing from
        *qrank_by_qid* get ``None``.
    """
    entries = inf_corpus.items()
    if show_progress:
        entries = tqdm(entries)
    for qid, data in entries:
        # A single dict lookup replaces the per-entry DataFrame index probe.
        data['qrank'] = qrank_by_qid.get(qid)
    return inf_corpus


def qrank_coverage_percent(inf_corpus):
    """Return the percentage of entries whose ``'qrank'`` is not None.

    An empty corpus yields 0.0 rather than raising ZeroDivisionError.
    """
    if not inf_corpus:
        return 0.0
    with_qrank = sum(1 for data in inf_corpus.values() if data['qrank'] is not None)
    return with_qrank / len(inf_corpus) * 100


def main():
    """Read the QRank CSV and the corpus, annotate the corpus, and save it."""
    ##Load the qrank csv and reduce it to a qid -> QRank dict
    qrank_by_qid = build_qrank_lookup(pd.read_csv(QRANK_CSV, compression='gzip'))

    ##Open the inference corpus
    with open(PATH_TO_INF_CORPUS, encoding='utf-8') as f:
        inf_corpus = json.load(f)

    ##Qid is the key - add the rank to each value dict as "qrank"
    add_qrank(inf_corpus, qrank_by_qid, show_progress=True)

    ##Check what percentage of qids have qrank
    print(f"Percentage of qids with qrank: {qrank_coverage_percent(inf_corpus)}")

    ##Save the updated inference corpus
    with open(PATH_TO_UPDATED_INF_CORPUS, 'w', encoding='utf-8') as f:
        json.dump(inf_corpus, f)


if __name__ == "__main__":
    main()


