#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
10/4/2025
Author: Katarina
takes the channel data
extracts the entities and puts for each post the entities in an index matched dict
"""

import os, re, json
import pandas as pd
import pickle as p

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from nltk.tokenize import sent_tokenize

# Checkpoint shared by the tokenizer and the token-classification model.
NER_MODEL_NAME = "Babelscape/wikineural-multilingual-ner"

# Built once at import time; `extract_raw_entities` reuses this pipeline.
tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
# `aggregation_strategy="simple"` is the replacement for the deprecated
# `grouped_entities=True` flag (same grouping of word pieces into entities).
ner = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")

def extract_raw_entities(text):
    """Run the module-level ``ner`` pipeline on ``text`` and return its entities.

    Args:
        text: The input handed unchanged to the ``ner`` pipeline.

    Returns:
        A new list holding the entities reported by the pipeline, in the
        pipeline's order (each entity is the dict produced by the pipeline).
        If the pipeline raises, the failure is reported on stdout and an
        empty list is returned.
    """
    try:
        found_entities = ner(text)
    except Exception as exc:
        # Best effort: one unprocessable post must not abort the whole run.
        # Catching ``Exception`` instead of using a bare ``except`` still lets
        # KeyboardInterrupt / SystemExit stop the script.
        print(f"Couldn't process the following text: {text}")
        print(f"Reason: {exc!r}")
        return []
    return list(found_entities)


def main():
    """Extract raw entities for every sampled channel and pickle the result.

    Reads the channel names from the ``username`` column of
    ``final_sample.csv``. For every channel that has a ``<channel>.json``
    file in the input directory, ``extract_raw_entities`` is run on each item
    of the loaded JSON (the posts), and a dict with the keys ``'post'`` and
    ``'raw_entities'`` is pickled to ``<channel>.p`` in the output directory.
    The two values are matched by index. Channels without an input file are
    printed with a ``!!!`` prefix and skipped.
    """
    usernames = list(pd.read_csv('final_sample.csv')['username'])
    in_directory = 'data/2_preprocessed/final_small'
    out_directory = 'data/3_raw_entities'

    # List the input directory once instead of once per channel.
    available = set(os.listdir(in_directory))
    # Create the output directory up front so a dump cannot fail only after
    # the entity extraction for a channel has already been done.
    os.makedirs(out_directory, exist_ok=True)

    total = len(usernames)
    for i, channel in enumerate(usernames, start=1):
        if f'{channel}.json' not in available:
            print(f'!!!{channel}')
            continue
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(f'{in_directory}/{channel}.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        channel_dict = {
            'post': data,
            'raw_entities': [extract_raw_entities(post) for post in data],
        }
        with open(f'{out_directory}/{channel}.p', 'wb') as f:
            p.dump(channel_dict, f)
        print(f'{i} out of {total} done!')

# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    main()
