#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
8/11/2024
Author: Katarina
"""

import requests
import os
import pandas as pd
import pickle as p
import re
from deep_translator import GoogleTranslator
import time

def fetch_wikidata(parameters, timeout=30):
    """Send one GET request to the Wikidata API and return the decoded JSON.

    Args:
        parameters: Query-string parameters passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the program forever.

    Returns:
        The decoded JSON response, or ``None`` when the request failed or the
        body was not valid JSON.
    """
    url = 'https://www.wikidata.org/w/api.php'
    try:
        response = requests.get(url, params=parameters, timeout=timeout)
        return response.json()
    # Only network/HTTP problems and undecodable bodies are treated as a
    # failed lookup; KeyboardInterrupt and genuine bugs still propagate.
    except (requests.exceptions.RequestException, ValueError):
        print('Request failed. Putting the program to sleep.')
        # Short pause so that the next request does not hit the API at once.
        time.sleep(2)
        return None

def get_channel_langs():
    """Build a ``{username: language code}`` mapping from ``final_sample.csv``.

    The file is read from the current working directory and must contain a
    ``username`` column. A username equal to ``'Italian'`` is mapped to
    ``'it'``; every other username is mapped to ``'en'``. When a username
    occurs more than once it appears once in the result.
    """
    usernames = pd.read_csv('final_sample.csv')['username']
    # NOTE(review): the language is derived by comparing the username itself
    # with 'Italian' - confirm that this is the intended column.
    return {name: 'it' if name == 'Italian' else 'en' for name in usernames}

def is_passable_entity(entity):
    """Tell whether an entity is confident enough to be looked up.

    Entities without a ``'score'`` key always pass. Otherwise the score must
    not be below the cut-off of its group: 0.75 for ``'MISC'`` entities and
    0.7 for every other ``'entity_group'``.
    """
    if 'score' not in entity:
        return True
    cutoff = 0.75 if entity['entity_group'] == 'MISC' else 0.7
    # Written as a negated '<' so that a score exactly on the cut-off passes.
    return not entity['score'] < cutoff


def _lookup_paragraph_entities(prep_entities, parameters, wikidata_lookup):
    """Return ``{query: search result}`` for the entities of one paragraph.

    Args:
        prep_entities: Entity dicts of one paragraph; each has a ``'word'``.
        parameters: Base API parameters shared by all requests of a channel.
        wikidata_lookup: Cache of earlier results, updated in place.
    """
    paragraph_wikidata = {}
    # Lower-cased and de-duplicated so each surface form is searched once.
    queries = {x['word'].lower() for x in prep_entities if is_passable_entity(x)}
    for query in queries:
        if query in wikidata_lookup:
            paragraph_wikidata[query] = wikidata_lookup[query]
            continue
        # A fresh dict per request keeps the shared parameters untouched.
        search_results = fetch_wikidata({**parameters, 'search': query})
        paragraph_wikidata[query] = search_results
        # Failed requests (None) are not cached, so a later occurrence of the
        # same query gets another chance instead of inheriting the failure.
        if search_results is not None:
            wikidata_lookup[query] = search_results
    return paragraph_wikidata


def main():
    """Fetch Wikidata search results for every not yet processed channel.

    Each ``.p`` pickle in ``data/4_preprocessed_entities`` is loaded, the
    passable entities of every entry in ``'preprocessed_entities'`` are
    searched on Wikidata in the channel's language, and the results are stored
    under ``'raw_wikidata'`` in a pickle of the same name in
    ``data/5_raw_wikidata``. Files that already exist in the output directory
    or in ``data/sample/data`` are skipped.
    """
    channel_langs = get_channel_langs()
    in_directory = 'data/4_preprocessed_entities'
    out_directory = 'data/5_raw_wikidata'

    for file in os.listdir(in_directory):
        # The directories are listed again on every iteration, so files that
        # appear there while this run is in progress are skipped as well.
        if not file.endswith('.p') or file in os.listdir(out_directory):
            continue
        if file in os.listdir('data/sample/data'):
            print(f'skipped {file}!')
            continue
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(f'{in_directory}/{file}', 'rb') as f:
            channel_data = p.load(f)

        channel = file[:-len('.p')]
        print(channel)
        parameters = {'action': 'wbsearchentities',
                      'format': 'json',
                      'language': channel_langs[channel]}

        wikidata_lookup = {}
        channel_wikidata = []
        all_entities = channel_data['preprocessed_entities']
        total = len(all_entities)
        for i, prep_entities in enumerate(all_entities):
            if i % 500 == 0 and i != 0:
                print(f"\t\t{i} out of {total} done!")
            # Keep the cache bounded by starting over once it grows too large.
            if len(wikidata_lookup) > 2500:
                wikidata_lookup.clear()
            channel_wikidata.append(
                _lookup_paragraph_entities(prep_entities, parameters, wikidata_lookup)
            )
        channel_data['raw_wikidata'] = channel_wikidata
        with open(f'{out_directory}/{file}', 'wb') as f:
            p.dump(channel_data, f)

if __name__ == '__main__':
    main()
