import json
from tqdm import tqdm


def clean_text(text):
    """Return ``text`` with every ``'*'`` character removed.

    Falsy values (``None``, the empty string, ...) are handed back
    unchanged, so callers may pass optional fields without checking them.
    """
    return text.replace('*', '') if text else text


def clean_one_file(file_name):
    """Strip ``'*'`` from the text fields of a scene JSON file, in place.

    The file is parsed as JSON and iterated as a sequence of scenes. For
    each scene the following fields are passed through ``clean_text``:

    * ``description`` of every item in ``scene['entries']`` and the
      ``name``/``description`` of every card in that entry's ``cards``;
    * ``name``/``description`` of ``scene['character']`` and of every card
      in its ``cards``;
    * ``name``/``description`` of every card in ``scene['extra_cards']``.

    The cleaned structure is then written back to the same path
    (overwriting it) and a confirmation line is printed.

    Args:
        file_name: Path of the JSON file to read and overwrite.
    """
    with open(file_name, encoding='utf-8') as in_f:
        scenes = json.load(in_f)

    def scrub(record, *fields):
        # Clean the listed keys of one dict, in the order given.
        for field in fields:
            record[field] = clean_text(record[field])

    for scene in scenes:
        for entry in scene['entries']:
            scrub(entry, 'description')
            for card in entry['cards']:
                scrub(card, 'name', 'description')

        persona = scene['character']
        scrub(persona, 'name', 'description')
        for card in persona['cards']:
            scrub(card, 'name', 'description')

        for card in scene['extra_cards']:
            scrub(card, 'name', 'description')

    with open(file_name, 'w', encoding='utf-8') as out_f:
        json.dump(scenes, out_f, indent=1, ensure_ascii=False)
    print('finish clean file', file_name)


def _load_word_set(path):
    """Return the set of whitespace-stripped lines of the file at ``path``."""
    with open(path, encoding='utf-8') as f:
        # readlines() gives tqdm a sized list so the bar can show a total.
        return {line.strip() for line in tqdm(f.readlines())}


def clean_conceptnet(
        in_file_name='../ConceptNet/conceptnet_triple_weight.csv',
        out_file_name='../ConceptNet/conceptnet_cleaned_weight.csv',
        word_file_name='../vocab/processed_vocab.txt',
        high_frequency_word_file_name=None):
    """Filter a ``|||``-separated ConceptNet triple file by vocabulary.

    Each input line is split on ``'|||'``. The first field is treated as
    the head, the second as the relation, the second-to-last as the tail
    and the last as the weight. A line is kept only if:

    * it has at least four fields (blank or truncated lines are skipped
      instead of raising ``IndexError``);
    * neither head nor tail contains an underscore;
    * both head and tail appear in the vocabulary file;
    * neither head nor tail appears in the high-frequency word file
      (when one is given).

    Kept lines are written as ``head|||relation|||tail|||weight``, one per
    line.

    Args:
        in_file_name: Path of the triple file to read.
        out_file_name: Path of the filtered file to write (overwritten).
        word_file_name: Path of the vocabulary file, one word per line.
        high_frequency_word_file_name: Optional path of a file listing
            words to exclude, one per line. ``None`` disables the check.
    """
    need_words = _load_word_set(word_file_name)
    if high_frequency_word_file_name:
        high_frequency_words = _load_word_set(high_frequency_word_file_name)
    else:
        high_frequency_words = set()

    res = []
    with open(in_file_name, encoding='utf-8') as f:
        for line in tqdm(f.readlines()):
            words = line.strip().split('|||')
            if len(words) < 4:
                # Blank or malformed line: there is no distinct
                # head/relation/tail/weight to extract.
                continue
            head, relation, tail, weight = (
                words[0], words[1], words[-2], words[-1])

            if '_' in head or '_' in tail:
                continue
            if head not in need_words or tail not in need_words:
                continue
            if head in high_frequency_words or tail in high_frequency_words:
                continue

            res.append('|||'.join((head, relation, tail, weight)))

    with open(out_file_name, 'w', encoding='utf-8') as f:
        for line in tqdm(res):
            f.write(line + '\n')


def clean_conceptnet_weight(
        in_file_name='../ConceptNet/conceptnet_cleaned_weight.csv',
        out_file_name='../ConceptNet/conceptnet_allvocab_cleaned_final.txt',
        min_weight=1.0):
    """Keep only the triples whose weight is strictly above ``min_weight``.

    Each non-blank input line is split on ``'|||'`` and its last field is
    parsed as a float weight. Lines whose weight is greater than
    ``min_weight`` are copied verbatim to the output file; all others are
    dropped. Blank lines are skipped rather than crashing ``float('')``.

    Args:
        in_file_name: Path of the ``|||``-separated triple file to read.
        out_file_name: Path of the filtered file to write (overwritten).
        min_weight: Exclusive lower bound on the weight. Defaults to 1.0.

    Raises:
        ValueError: If the last field of a non-blank line is not a number.
    """
    kept = []
    with open(in_file_name, encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            weight = stripped.split('|||')[-1]
            if float(weight) > min_weight:
                # Keep the original line, including its line ending.
                kept.append(line)

    with open(out_file_name, 'w', encoding='utf-8') as f:
        f.writelines(kept)


if __name__ == '__main__':
    # The scene-JSON cleanup is currently disabled; to re-enable it, call
    # clean_one_file() on 'test.json', 'valid.json' and 'train.json'.
    #
    # ConceptNet pipeline: the vocabulary filter must run first, because the
    # weight filter's default input path is that step's default output path.
    for step in (clean_conceptnet, clean_conceptnet_weight):
        step()
