source("util.R")
library(lubridate)
######################################################################
######################################################################
# Descriptives
######################################################################
######################################################################
# Survey responses keyed by Twitter handle.
survey_data <- fread("handles_1_16_withResponses.csv")

print(paste("N Survey Handles: ", length(unique(survey_data$handle))))

# Basic profile info for the surveyed accounts; screen names are lower-cased
# before being used as the join key below.
# NOTE(review): `survey_data$handle` is not lower-cased in this script --
# presumably it is already normalised in the CSV; confirm, otherwise
# mixed-case handles are dropped by the inner join.
basic_user <- read_simple_user_info_w_nulls("all_survey_basic.tsv")
basic_user[, user_screenname := tolower(user_screenname)]

# Inner join: only handles present in both tables survive.
survey_basic_data <- merge(
  survey_data,
  basic_user,
  by.x = "handle",
  by.y = "user_screenname"
)

# Row counts before/after the join. Wrapped in print() so they are also shown
# when the script is run through source(), which does not auto-print
# top-level values.
print(nrow(survey_data))
print(nrow(basic_user))
print(nrow(survey_basic_data))

print(paste("N Survey Handles Active 5/1/20: ",
            length(unique(survey_basic_data$uid))))

# Collected-tweet counts, read with one row per (userid, user_screenname)
# combination as stored in the TSV.
uid_sn_tweet_count <- fread("tweets_per_survey_user.tsv")
uid_sn_tweet_count[, user_screenname := tolower(user_screenname)]

# Diagnostic: distribution of distinct screen names per account. print() keeps
# the table visible when the script is run through source().
print(table(
  uid_sn_tweet_count[, length(unique(user_screenname)), by = userid]$V1
))

# toss people with handles linked to > 1 account
# The per-handle account count is computed once and reused for both the
# diagnostic table and the filter.
accounts_per_handle <- uid_sn_tweet_count[, length(unique(userid)),
                                          by = user_screenname]
print(table(accounts_per_handle$V1))
uid_sn_tweet_count <- uid_sn_tweet_count[
  user_screenname %in% accounts_per_handle[V1 == 1]$user_screenname
]

# Total tweets collected per account, summed over its screen names.
uid_tweet_count <- uid_sn_tweet_count[,
                                      list(n_tweets_collected = sum(count)),
                                      by = .(userid)]

# Left join: keep every surveyed account, including those without tweets.
fin_survey <- merge(survey_basic_data,
                    uid_tweet_count,
                    by.x = "uid",
                    by.y = "userid",
                    all.x = TRUE)

# Accounts with no match in the tweet counts get an explicit zero. `:=` with
# an `i` filter updates in place and is a no-op when no rows match.
fin_survey[is.na(n_tweets_collected), n_tweets_collected := 0L]



# Accounts at or above 100k followers mostly did not look like people, so
# only accounts below that threshold are kept (rows with a missing
# followers_count are dropped by the filter as well).
fin_survey <- fin_survey[followers_count < 1e5]

# Distinct users that still have at least one collected tweet.
print(paste(
  "N Survey Users w/ at least 1 tweet: ",
  fin_survey[n_tweets_collected > 0, length(unique(uid))]
))

############ Survey/tweet tables for all four targets ############
# (vaccines, Trump, lockdowns, masks -- not only vaccines)

# Per-target output of get_surv_tweet(), stacked into one table.
surv_tweet <- rbind(get_surv_tweet(fin_survey, VAX_CONFIG),
                    get_surv_tweet(fin_survey, TRUMP_CONFIG),
                    get_surv_tweet(fin_survey, LOCKDOWN_CONFIG),
                    get_surv_tweet(fin_survey, MASK_CONFIG))

# Per-target output of get_tweets(), stacked in the same target order.
tweets_to_sample <- rbind(get_tweets(fin_survey, VAX_CONFIG),
                          get_tweets(fin_survey, TRUMP_CONFIG),
                          get_tweets(fin_survey, LOCKDOWN_CONFIG),
                          get_tweets(fin_survey, MASK_CONFIG))

# Persist the sampling frame; row names are omitted from the CSV.
write.csv(tweets_to_sample, "tweets_to_sample.csv", row.names = FALSE)


# Attach one survey-stance column per target. After every get_*_stance() call
# the `survey_stance` column is renamed to a target-specific name, so each
# rename must happen before the next helper runs (presumably every helper
# writes to the same `survey_stance` column -- see util.R).
fin_survey <- get_trump_stance(fin_survey, remove_users = FALSE)
setnames(fin_survey, "survey_stance", "trump_stance")

fin_survey <- get_mask_stance(fin_survey)
setnames(fin_survey, "survey_stance", "mask_stance")

fin_survey <- get_lockdown_stance(fin_survey)
setnames(fin_survey, "survey_stance", "lockdown_stance")

fin_survey <- get_vax_stance(fin_survey)
setnames(fin_survey, "survey_stance", "vaccine_stance")

######################################################################
#### Construct data ###############
######################################################################

############ Get annotations
# something weird with qualtrics on one tweet: it is excluded from both the
# raw annotations and the resolved disagreements. The id is defined once so
# the two filters cannot drift apart. The literal is a multiple of 256 and is
# therefore exactly representable as a double.
EXCLUDED_TID <- 1314031460215648256

# One response table per annotator, stacked into a single table.
a1 <- get_responses("a1", "data/a1.csv")
a2 <- get_responses("a2", "data/a2.csv")
a3 <- get_responses("a3", "data/a3.csv")
a4 <- get_responses("a4", "data/a4.csv")
a5 <- get_responses("a5", "data/a5.csv")
dat <- rbind(a1, a2, a3, a4, a5)
dat <- dat[tid != EXCLUDED_TID]

############ Disagreements resolved
# NOTE(review): these exports are read from a user-specific ~/Downloads
# folder, so this step only runs on a machine that has those files.
d1 <- get_responses("d1", "~/Downloads/disagree_kenny_May 12, 2021_09.36.csv")
d2 <- get_responses("d2", "~/Downloads/disagree_jon_May 12, 2021_09.36.csv")
d3 <- get_responses("d3", "~/Downloads/disagree_alexi_May 12, 2021_18.24.csv")
disagree_final <- rbind(d1, d2, d3)
# uid is converted to integer64 so it can be used as a merge key against the
# annotation tables, which get the same conversion before merging.
disagree_final[, uid := as.integer64(uid)]
disagree_final <- disagree_final[tid != EXCLUDED_TID]
################ final dataset
# Wide layout: one stance column per annotator for every (tid, uid, target).
sp <- spread(
  dat[, c("annotator", "tid", "uid", "target", "stance"), with = FALSE],
  key = annotator,
  value = stance
)

# Attach the per-tweet confidence summary produced by get_c(); the columns
# a1_conf / a2_conf used below presumably come from that helper.
sp <- merge(
  x = sp,
  y = dat[, get_c(confidence), by = c("tid", "target")],
  by = c("tid", "target")
)

# Combined confidence: each rating scores Very = 2, Somewhat = 1, anything
# else = 0, and the two scores are added.
sp[, confidence_integer := {
  score <- function(conf) {
    ifelse(conf == "Very", 2, ifelse(conf == "Somewhat", 1, 0))
  }
  score(a1_conf) + score(a2_conf)
}]

# Flag tweets where at least one of the two ratings was "Not at all".
sp[, any_notatall_conf := (a1_conf == "Not at all") | (a2_conf == "Not at all")]

# Stack, over every unordered pair of annotator columns, the rows of
# `annotations` where both annotators gave a label and `keep_pair(x, y)` is
# TRUE for their labels. Pairs are visited in combn() order, i.e.
# (a1,a2), (a1,a3), ..., (a4,a5).
stack_annotator_pairs <- function(annotations, annotator_cols, keep_pair) {
  pair_list <- combn(annotator_cols, 2, simplify = FALSE)
  pieces <- lapply(pair_list, function(pair) {
    first_label <- annotations[[pair[1]]]
    second_label <- annotations[[pair[2]]]
    row_idx <- which(
      !is.na(first_label) & !is.na(second_label) &
        keep_pair(first_label, second_label)
    )
    annotations[row_idx]
  })
  rbindlist(pieces, use.names = TRUE)
}

annotator_cols <- c("a1", "a2", "a3", "a4", "a5")

# Tweets on which a pair of annotators gave the same label.
# NOTE(review): this assumes each tweet was labelled by exactly two
# annotators -- confirm. With three or more labels a tweet would appear once
# per matching pair, and `fin_ann` below takes the first non-missing
# annotator column rather than the agreeing one.
agree_basic <- stack_annotator_pairs(sp, annotator_cols, `==`)
agree_basic[, fin_ann := ifelse(!is.na(a1), a1,
                                ifelse(!is.na(a2), a2,
                                       ifelse(!is.na(a3), a3, a4)))]
agree_basic[, has_disagreement := FALSE]
agree_basic[, uid := as.integer64(uid)]

# Tweets on which a pair of annotators gave different labels; the final label
# comes from the resolved-disagreement table instead.
disagree_basic <- stack_annotator_pairs(sp, annotator_cols, `!=`)
disagree_basic[, uid := as.integer64(uid)]
disagree_basic <- merge(disagree_basic,
                        disagree_final[, -c("confidence", "response", "annotator")],
                        by = c("tid", "target", "uid"))
disagree_basic[, has_disagreement := TRUE]
# The resolved stance becomes the final annotation, matching agree_basic.
setnames(disagree_basic, "stance", "fin_ann")

# Combine agreed and resolved annotations into one table.
final_data <- rbind(agree_basic, disagree_basic)
final_data[, id_str := as.integer64(tid)]
final_data[, uid := as.integer64(uid)]

# Attach tweet-level fields (dropping the tweet table's own survey_stance),
# then the user-level survey fields (dropping the survey table's created_at).
final_data <- merge(final_data,
                    tweets_to_sample[, -"survey_stance"],
                    by = c("target", "id_str"))
final_data <- merge(final_data,
                    fin_survey[, -"created_at"],
                    by = "uid")

# Survey stance for the row's own target, mapped through
# reverse_get_stance(); any target other than Trump/Masks/Lockdowns falls
# through to the vaccine stance.
final_data[, surv := ifelse(
  target == "Trump", reverse_get_stance(trump_stance),
  ifelse(target == "Masks", reverse_get_stance(mask_stance),
         ifelse(target == "Lockdowns", reverse_get_stance(lockdown_stance),
                reverse_get_stance(vaccine_stance)))
)]
final_data <- final_data[!is.na(surv)]

# Same level set on both factors so they can be compared element-wise.
final_data[, fin_ann := factor(fin_ann, levels = c("Anti", "Neutral", "Pro"))]
final_data[, surv := factor(surv, levels = c("Anti", "Neutral", "Pro"))]
final_data[, surv_ann_agree := surv == fin_ann]

# NOTE(review): %a and %b parse day/month names using the current locale, so
# this presumably needs an English LC_TIME -- confirm.
final_data[, createdat_date := as.POSIXct(created_at,
                                          format = "%a %b %d %H:%M:%S +0000 %Y",
                                          tz = "GMT")]

# Summary counts: distinct userid values, rows, distinct tweets. print() keeps
# the output visible when the script is run through source().
print(final_data[, list(t = length(unique(userid)),
                        u = .N,
                        n = length(unique(tid)))])

all_ann <- sp

# uid -> anonymous id lookup, built once so both exports below share exactly
# the same mapping. seq_len() also behaves correctly when final_data is empty
# (1:length(x) would yield c(1, 0)).
anon_ids <- data.table(uid = unique(final_data$uid))
anon_ids[, anon_id := seq_len(.N)]

# Survey rows of users that appear in final_data and have more than one row in
# fin_survey.
multi_row_uids <- fin_survey[uid %in% final_data$uid, .N, by = uid][N > 1]$uid
f <- fin_survey[uid %in% multi_row_uids]

# Replace the real uid with the anonymous id before writing.
f <- merge(f[, .(trump_stance, lockdown_stance, mask_stance, vaccine_stance,
                 uid, survey_date)],
           anon_ids,
           by = "uid")
f[, uid := NULL]
setnames(f, "anon_id", "uid")
write.csv(f, "data/interwave_survey_data.csv")

# Anonymised annotation-level export: only the listed columns are written.
final_data <- merge(final_data, anon_ids, by = "uid")
write.csv(final_data[, .(any_notatall_conf, surv, fin_ann, target, a1_conf,
                         a2_conf, anon_id, survey_date, createdat_date,
                         ideology, surv_ann_agree)],
          "data/anon_final_data.csv")

# All annotation info with the tweet and user identifiers removed.
write.csv(sp[, -c("tid", "uid")], "data/all_annotated_info.csv")

