#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
12/6/2025
Author: Katarina
"""

import json
import math
import os
import pickle as p
import re
from collections import Counter

import pandas as pd

def get_channel_langs():
    """Map each channel username to a two-letter language code.

    Reads ``final_sample.csv`` from the working directory and uses its
    ``language`` and ``username`` columns. Language names are compared
    case-insensitively; rows whose language is not Italian, Dutch or
    English are left out of the result.

    Returns:
        dict: ``{username: 'it' | 'nl' | 'en'}``.
    """
    table = pd.read_csv('final_sample.csv')
    iso_codes = {'italian' : 'it', 'dutch' : 'nl', 'english' : 'en'}

    # Normalise every language name up front so matching ignores case.
    lowered = []
    for raw_language in table['language']:
        lowered.append(raw_language.lower())

    langs_by_channel = {}
    for username, language in zip(table['username'], lowered):
        if language not in iso_codes:
            continue
        langs_by_channel[username] = iso_codes[language]
    return langs_by_channel

def get_channel_types():
    """Map each channel username to its label.

    Reads ``final_sample.csv`` from the working directory and pairs the
    ``username`` column with the ``label`` column row by row. If a
    username appears more than once, the last row wins.

    Returns:
        dict: ``{username: label}``.
    """
    table = pd.read_csv('final_sample.csv')
    labels = table['label']
    usernames = table['username']
    return dict(zip(usernames, labels))

def first_counts():
    """Collect per-emotion lemma lists and per-language lemma post counts.

    Scans ``data/lemmas_emotions`` for files named ``<channel>.json``. Each
    file is loaded as a sequence of posts; for every post, ``post[1]`` is
    iterated as its emotions and ``post[2]`` as its lemmas. Lemmas are
    de-duplicated within a post, so a lemma counts at most once per post.

    Files that do not end in ``.json`` are ignored. Channels for which
    ``get_channel_langs()`` has no language code (not listed in the sample
    CSV, or in a language other than Italian/Dutch/English) are skipped
    with a printed message instead of aborting the run with a ``KeyError``.

    Returns:
        tuple: ``(all_lemmas, raw_counts)`` where

        * ``all_lemmas[lang][emotion]`` is a list with the unique lemmas of
          every post tagged with that emotion (a lemma appears once per
          such post);
        * ``raw_counts[lang][lemma]`` is the number of posts in that
          language containing the lemma.
    """
    path = 'data/lemmas_emotions'
    suffix = '.json'

    channel_langs = get_channel_langs()
    all_lemmas = {}
    raw_counts = {}

    for file in os.listdir(path):
        # Match on the real extension so names like 'x.json.bak' are ignored.
        if not file.endswith(suffix):
            continue
        channel = file[:-len(suffix)]
        lang = channel_langs.get(channel)
        if lang is None:
            print(f'skipping {file}: no supported language for this channel')
            continue
        with open(f'{path}/{file}', 'rb') as f:
            data = json.load(f)

        lang_lemmas = all_lemmas.setdefault(lang, {})
        lang_counts = raw_counts.setdefault(lang, {})
        for post in data:
            # De-duplicate so each lemma is counted once per post.
            post_lemmas = list(set(post[2]))
            for lemma in post_lemmas:
                lang_counts[lemma] = lang_counts.get(lemma, 0) + 1
            for emotion in post[1]:
                lang_lemmas.setdefault(emotion, []).extend(post_lemmas)
    return all_lemmas, raw_counts

def make_idfs(raw_counts):
    """Turn per-language lemma counts into IDF-style scores.

    For every language, each lemma gets ``-log10(count / n)`` where ``n``
    is the number of entries in that language's count mapping. When a
    count is larger than ``n`` the score is set to ``0`` rather than
    going negative.

    NOTE(review): ``n`` is the number of distinct lemmas, not a number of
    posts/documents -- confirm that this is the intended normalisation.

    Args:
        raw_counts: ``{lang: {lemma: count}}``.

    Returns:
        dict: ``{lang: {lemma: score}}``.
    """
    idfs = {}
    for lang, lemma_counts in raw_counts.items():
        n_entries = len(lemma_counts)
        idfs[lang] = {
            lemma: 0 if count > n_entries else -math.log(count / n_entries, 10)
            for lemma, count in lemma_counts.items()
        }
    return idfs

def make_tfs(all_lemmas):
    """Compute a term-frequency table per language and emotion.

    For each ``(lang, emotion)`` pair the occurrences of every lemma in
    ``all_lemmas[lang][emotion]`` are counted and divided by the number of
    distinct lemmas in that list. The raw counts are cached on disk as
    ``tf_idfs/counts_<lang>_<emotion>.p``; an existing cache file is loaded
    instead of recounting. The ``tf_idfs`` directory is created if missing.

    Progress (language, emotion, cache hit/miss) is printed to stdout.

    Args:
        all_lemmas: ``{lang: {emotion: [lemma, ...]}}``.

    Returns:
        dict: ``{lang: {emotion: {lemma: count / n_distinct_lemmas}}}``.
    """
    cache_dir = 'tf_idfs'
    os.makedirs(cache_dir, exist_ok=True)

    tfs = {}
    for lang, lemmas_by_emotion in all_lemmas.items():
        print(lang)
        tfs[lang] = {}
        for emotion, lemmas in lemmas_by_emotion.items():
            print(emotion)
            n_lemmas = len(set(lemmas))
            cache_path = os.path.join(cache_dir, f'counts_{lang}_{emotion}.p')
            if os.path.isfile(cache_path):
                print('found!')
                # NOTE: a cached file is trusted as-is. It is not refreshed
                # when the input data changes (delete it to force a recount),
                # and unpickling runs arbitrary code, so only load cache
                # files this script wrote itself.
                with open(cache_path, 'rb') as f:
                    counts = p.load(f)
            else:
                # Single pass over the list instead of one list.count() scan
                # per distinct lemma.
                counts = dict(Counter(lemmas))
                with open(cache_path, 'wb') as f:
                    p.dump(counts, f)
                print('dumped!')
            tfs[lang][emotion] = {
                lemma: count / n_lemmas for lemma, count in counts.items()
            }
    return tfs

def main():
    """Build the IDF and TF tables and pickle them under ``tf_idfs/``.

    Runs ``first_counts()``, feeds its results to ``make_idfs()`` and
    ``make_tfs()``, then writes ``tf_idfs/idfs.p`` and ``tf_idfs/tfs.p``.
    """
    # Create the output directory up front so a fresh checkout does not
    # fail with FileNotFoundError when the tables are written.
    os.makedirs('tf_idfs', exist_ok=True)

    all_lemmas, raw_counts = first_counts()
    idfs = make_idfs(raw_counts)
    tfs = make_tfs(all_lemmas)
    with open('tf_idfs/idfs.p', 'wb') as f:
        p.dump(idfs, f)
    with open('tf_idfs/tfs.p', 'wb') as f:
        p.dump(tfs, f)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
