## Usage: python3 control-analysis.py --year 2020 --month mar --demographic_type female
import logging
import random
from tqdm import tqdm
from transformers import pipeline
import argparse

# Configure the root logger once at import time: emit everything from DEBUG
# upwards as "<timestamp> - <level> - <message>", with day-month-year timestamps.
logging.basicConfig(level=logging.DEBUG,
					format='%(asctime)s - %(levelname)s - %(message)s',
					datefmt='%d-%b-%y %H:%M:%S')

def parse_args(argv=None):
	"""Parse the command-line options of the script.

	Args:
		argv: Optional list of argument strings to parse. When None (the
			default), argparse reads the arguments from ``sys.argv[1:]``.

	Returns:
		argparse.Namespace with the attributes ``year`` (int), ``month``
		(one of 'mar', 'jun', 'sep', 'dec') and ``demographic_type`` (one of
		the demographic group names listed below).

	Raises:
		SystemExit: raised by argparse when an option is missing or invalid.
	"""
	parser = argparse.ArgumentParser()
	# Every option is required, so no defaults are declared: argparse never
	# falls back to a default for an option marked required=True.
	parser.add_argument('--year', type=int, required=True,
		help='The year the corpus was collected in')
	parser.add_argument('--month', type=str, required=True,
		choices=['mar', 'jun', 'sep', 'dec'],
		help='The last month of the quarter the corpus was collected in')
	parser.add_argument('--demographic_type', type=str, required=True,
		choices=['female', 'male', 'christian', 'jewish', 'muslim', 'hindu',
			'buddhist', 'black', 'white', 'asian', 'latino', 'young', 'old'],
		help='The demographic group whose term list is matched against the tweets')
	return parser.parse_args(argv)

# Maps the --month label (last month of a quarter) to the zero-padded month
# numbers of that quarter, as they are matched against 'created_at'.
QUARTER_MONTHS = {
	'mar': ('01', '02', '03'),
	'jun': ('04', '05', '06'),
	'sep': ('07', '08', '09'),
	'dec': ('10', '11', '12'),
}


def load_tweets(path, year, months):
	"""Return the lower-cased text of the tweets in *path* created in *months* of *year*.

	Each non-blank line of *path* is evaluated into a mapping that must provide
	the keys 'text' and 'created_at'. A tweet is kept when its 'created_at'
	value contains any of the substrings '<year>-<month>'.

	Args:
		path: Path of the tweet file, one record per line.
		year: Year used to build the '<year>-<month>' substrings.
		months: Iterable of month strings (e.g. '01').

	Returns:
		List of tweet texts, lower-cased and with surrogate pairs merged.
	"""
	# The accepted time stamps do not depend on the line, so build them once.
	time_stamps = [f'{year}-{month}' for month in months]
	texts = []
	# The context manager guarantees the file is closed, even on a parse error.
	with open(path, "r") as tweet_file:
		for line in tweet_file:
			if not line.strip():
				# A blank line cannot be evaluated; skip it instead of crashing.
				continue
			# SECURITY NOTE(review): eval() executes arbitrary code found in the
			# data file. It is kept because the exact on-disk record format is
			# not visible here; if the records are plain JSON or Python
			# literals, switch to json.loads / ast.literal_eval.
			line_dict = eval(line)
			tweet_text = line_dict['text'].lower()
			tweet_time = line_dict['created_at']
			if any(time_stamp in tweet_time for time_stamp in time_stamps):
				# Round-trip through UTF-16 so that a pair of separate surrogate
				# code points becomes the single character it encodes.
				texts.append(tweet_text.encode('utf-16', 'surrogatepass').decode('utf-16'))
	return texts


def load_demographic_terms(path):
	"""Return the lower-cased, stripped, non-empty lines of the term file *path*."""
	with open(path) as demographic_file:
		stripped = (line.lower().strip() for line in demographic_file)
		# An empty term is a substring of every text and would make every
		# tweet match, so blank lines are dropped.
		return [term for term in stripped if term]


def count_sentiment(texts, terms, classify):
	"""Count positive and negative term matches over *texts*.

	For every text that contains at least one of *terms* (substring match on
	the lower-cased text), ``classify(text)`` is called once. A 'positive' or
	'negative' label adds the number of matching terms to the respective
	counter; any other label is ignored.

	NOTE(review): a tweet containing k terms contributes k to its counter, as
	in the previous implementation -- confirm whether once-per-tweet counting
	is what the analysis needs.

	Args:
		texts: Iterable of tweet texts.
		terms: Sequence of lower-cased terms to look for.
		classify: Callable mapping a text to its sentiment label.

	Returns:
		Tuple ``(positive_count, negative_count, tweet_count)`` where
		tweet_count is the number of texts iterated, matching or not.
	"""
	p_count = 0
	n_count = 0
	tweet_count = 0
	for text in texts:
		tweet_count += 1
		text = text.lower()
		matches = sum(1 for term in terms if term in text)
		if not matches:
			continue
		# The label depends only on the text, so the model runs once per
		# tweet rather than once per matching term.
		sentiment_label = classify(text)
		if sentiment_label == 'positive':
			p_count += matches
		elif sentiment_label == 'negative':
			n_count += matches
	return p_count, n_count, tweet_count


def main():
	"""Sample a fifth of the selected quarter's tweets and print sentiment counts."""
	args = parse_args()
	year = args.year
	months = QUARTER_MONTHS[args.month]

	logging.info("Loading tweets ...")
	corpus_text = load_tweets(f"./data/tweets-{year}.jl", year, months)
	# Analyse a random 20% sample of the selected tweets (unseeded, so the
	# sample differs between runs).
	sample_n = len(corpus_text) // 5
	corpus_text = random.sample(corpus_text, k=sample_n)

	demographic_list = load_demographic_terms(f"./data/{args.demographic_type}_list.txt")

	model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
	sentiment_pipeline = pipeline(model=model_path, tokenizer=model_path, max_length=512, truncation=True)

	p_count, n_count, tweet_count = count_sentiment(
		tqdm(corpus_text),
		demographic_list,
		lambda text: sentiment_pipeline(text)[0]['label'],
	)

	print('positive_count: ', p_count, 'negative_count: ', n_count, 'tweet_count: ', tweet_count)


if __name__ == '__main__':
	main()


