import json
import pandas as pd

# Input locations: entity table (CSV) and pre-computed contexts (pickle).
PATH_TO_ALL_ENT_DF = "/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/all_ents_wdata_with_templates.csv"
PATH_TO_ALL_CONTEXTS = "/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/all_contexts.pkl"

# Read the entity table, keep only the columns used below, discard rows
# that have no English Wikipedia title, and index the frame by that title.
# NOTE(review): the 'Aliases' converter runs eval() on every cell, which
# executes arbitrary code if the CSV is not fully trusted. Consider
# ast.literal_eval once the column is confirmed to hold plain literals.
all_ents_df = (
    pd.read_csv(PATH_TO_ALL_ENT_DF, converters={'Aliases': eval})
    [['QID', 'Label', 'Aliases', 'EngWikipediaTitle']]
    .dropna(subset=['EngWikipediaTitle'])
    .set_index('EngWikipediaTitle')
)

# Read the pickled contexts object and snapshot its keys as a set.
# NOTE(review): read_pickle can also execute code from an untrusted file.
all_contexts = pd.read_pickle(PATH_TO_ALL_CONTEXTS)
context_keys = set(all_contexts.keys())

# Look up each entity's context directly by its Wikipedia title (the index).
# Rows whose title has no entry are deliberately kept with a missing Context
# here: the alias-based fallback further down only operates on rows where
# Context is still missing, so dropping them at this point would turn that
# fallback into a no-op. Rows that remain unmatched are filtered out after
# the fallback has run.
all_ents_df['Context'] = all_ents_df.index.map(all_contexts.get)

# Mapping remaining titles using aliases
def map_aliases_to_context(row, contexts=None):
    """Return the context of the first alias of *row* that has one.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing an
            'Aliases' entry that holds an iterable of alias strings.
        contexts: Optional mapping from title to context. When omitted,
            the module-level ``all_contexts`` is used, which keeps the
            function usable as a one-argument callback for
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        The context stored for the first alias found in *contexts*, or
        None when no alias matches or the alias value is not iterable.
    """
    if contexts is None:
        contexts = all_contexts

    aliases = row['Aliases']
    try:
        candidates = iter(aliases)
    except TypeError:
        # A missing alias list (None / NaN) simply means "no aliases";
        # do not abort the whole run for one malformed row.
        return None

    for alias in candidates:
        if alias in contexts:
            return contexts[alias]
    return None

# Apply the alias fallback only to rows where Context is still missing.
missing_contexts = all_ents_df['Context'].isna()
# Skip the fallback when nothing is missing: a row-wise apply over an empty
# selection does no useful work and is not guaranteed to produce a Series
# that can be assigned back into a single column.
if missing_contexts.any():
    all_ents_df.loc[missing_contexts, 'Context'] = (
        all_ents_df[missing_contexts].apply(map_aliases_to_context, axis=1)
    )

# Keep only the rows that ended up with a context (direct or via alias).
final_df = all_ents_df.dropna(subset=['Context'])

# Re-key the contexts by QID; if a QID occurs more than once, the last
# occurrence wins in the resulting dictionary.
all_contexts_qid = final_df.set_index('QID')['Context'].to_dict()

# Save the context dictionary
output_path = "/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/entity_training/all_contexts_qid_people.json"
with open(output_path, "w") as f:
    json.dump(all_contexts_qid, f)

print("Titles processed:", len(final_df))
