
## Script to run recalibration and plot figures
library(readr)
# All input and output paths below are relative to this directory, so the
# script has to be started from its parent folder.
setwd("Vidgen-etal-recalibration")
davidson <- readr::read_csv("Vidgen-etal-recalibration_Davidson-annotations.csv")
toxicity <- readr::read_csv("Vidgen-etal-recalibration_Perspective-annotations.csv")


## Data preparation
# Optional step: Davidson bunches values at exactly 0.5, so drop those rows.
# which() also discards rows whose prob_hate is NA.
davidson <- davidson[which(davidson$prob_hate != 0.5), ]
# Order the Perspective rows by ascending score.
sortedtoxicity <- toxicity[order(toxicity$TOXICITY), ]

# D is 1 when the five annotator columns sum to at least 3, else 0.
D <- ifelse(
  sortedtoxicity$annotator_1 + sortedtoxicity$annotator_2 +
    sortedtoxicity$annotator_3 + sortedtoxicity$annotator_4 +
    sortedtoxicity$annotator_5 >= 3,
  1, 0
)
# E is the classifier score, in the same (sorted) row order as D.
E <- sortedtoxicity$TOXICITY

# Sum of columns 4 to 8 (presumably the five Davidson annotator labels;
# confirm against the CSV header). `[[` returns the column as a plain vector
# for both tibbles and data frames, instead of relying on a positional `drop`.
davidson$count <- davidson[[4]] + davidson[[5]] + davidson[[6]] +
  davidson[[7]] + davidson[[8]]


####
## ROC CURVES
####
library(pROC)

# ROC for the Perspective data: response D, predictor E.
# NOTE(review): `smoothed`, `ci.alpha` and `show.thres` do not look like
# documented pROC argument names (the documented ones appear to be `smooth`,
# `conf.level` and `print.thres`) -- confirm whether they have any effect.
pROCtoxicity <- pROC::roc(D,E,smoothed = TRUE,
                    # arguments for ci
                    ci=TRUE, ci.alpha=0.9, stratified=FALSE,
                    # arguments for plot
                    plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
                    print.auc=TRUE, show.thres=TRUE)

# ROC for the Davidson data: an item is treated as positive when count >= 2,
# and prob_hate is the predictor.
pROChatesonar <- pROC::roc(ifelse(davidson$count>=2,1,0),davidson$prob_hate,smoothed = TRUE,
                     # arguments for ci
                     ci=TRUE, ci.alpha=0.9, stratified=FALSE,
                     # arguments for plot
                     plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
                     print.auc=TRUE, show.thres=TRUE)

#plot(pROChatesonar,col="blue",main="") # ROC for hatesonar (blue) and toxicity (red)")
#lines(pROCtoxicity,col="red")
# Combined figure written to roc.pdf: both curves on a square plot region.
pdf("roc.pdf",width=4,height=4,paper='special')
par(pty="s")
plot.new()
# The x-axis runs from 1 down to 0 because it shows specificity.
plot.window(xlim=c(1,0),ylim=c(0,1))
axis(1,at=c(0,1),labels=c(0,1),mgp=c(0,0.6,0))
axis(2,at=c(0,1),labels=c(0,1),mgp=c(0,0.5,0))
title(main="", sub="",
      mgp=c(1,0,0),xlab="Specificity", ylab="Sensitivity") 
# Draw the curves after smoothing with method="density".
lines(smooth(pROChatesonar,n=60,method="density"),col="blue",lwd=2)
lines(smooth(pROCtoxicity,n=60,method="density"),col="red",lwd=2)
#text(0.85,0.9,"Toxicity",col="red")
#text(0.5,0.7,"HateSonar",col="blue")
# smaller image:
# Curve labels, placed at hand-picked coordinates.
text(0.86,0.96,"Perspective",col="red",cex=0.8)
text(0.39,0.7,"Davidson et al.",col="blue",cex=0.8)
dev.off()
# I outputted roc.eps using 500x500

#library(ggplot2)
#g2 <- ggroc(list(Toxicity=pROCtoxicity, HateSonar=pROChatesonar)) #, ndka=roc(aSAH$outcome, aSAH$ndka)))
#g2 + theme_minimal() + #ggtitle("My ROC curve") + 
#  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed")


####
## ANNOTATOR DIFFERENCE CURVES R ISOTONIC
####
annotators <- c("annotator_1", "annotator_2", "annotator_3", "annotator_4", "annotator_5")
annotatorcolor <- c("indianred", "orange", "green4", "hotpink", "purple")
# plot.new() leaves the default unit-square coordinates in place.
plot.new()
axis(1)
axis(2)
# Isotonic regression of the label D on the score E (thick black line).
lines(isoreg(E, D), col = "black", lwd = 3)
# One isotonic fit per annotator column (columns 3 to 7 of `toxicity`).
# `[[` is required here: on a tibble, `toxicity[, i + 2]` is a one-column
# tibble of length 1, which isoreg() cannot pair with the score vector.
for (i in seq_along(annotatorcolor)) {
  lines(isoreg(toxicity$TOXICITY, toxicity[[i + 2]]),
        col = annotatorcolor[i])
}
# Diagonal reference line.
lines(c(0, 1), c(0, 1), col = "grey")
# I outputted isoreg-annotators.eps using 500x500




####
## STAN 
####
library("splines2")
library("rstan")
# Let rstan write compiled models to disk so they can be reused between runs.
rstan_options(auto_write = TRUE)
# Use as many cores as the machine reports for running chains.
options(mc.cores = parallel::detectCores())

# Logistic function 1 / (1 + exp(-x)), applied elementwise to x.
sigmoid <- function(x) {
  denominator <- 1 + exp(-x)
  1 / denominator
}

# Fit the model in "sigmoid-spline-isotone.stan" and return its draws.
#   annotated: labels, one per item.
#   score:     classifier scores, one per item.
# The scores are expanded into a cubic ibs() basis on [0, 1] with interior
# knots at 0.1, ..., 0.9 and 0.95; the basis is passed transposed (one row
# per basis function). Returns the list produced by extract(permuted = TRUE).
runstan_isotone <- function(annotated,score) {
  spline_knots <- c(seq(0.1, 0.9, 0.1), 0.95)
  I <- t(ibs(score,
             knots = spline_knots,
             degree = 3,
             intercept = TRUE,
             Boundary.knots = c(0, 1)))
  stan_data <- list(
    num_data = length(annotated),
    num_basis = nrow(I),
    annotated = annotated,
    I = I,
    score = score,
    justprior = 0
  )
  fit <- rstan::stan("sigmoid-spline-isotone.stan", data = stan_data, chains = 4)
  extract(fit, permuted = TRUE)
}

# Fit the model in "sigmoid-spline.stan" and return its draws.
#   annotated: labels, one per item.
#   score:     classifier scores, one per item.
# The scores are expanded into a cubic bSpline() basis on [0, 1] with
# interior knots at 0.1, ..., 0.9; the basis is passed transposed (one row
# per basis function). Returns the list produced by extract(permuted = TRUE).
runstan_nonisotone <- function(annotated,score) {
  spline_knots <- seq(0.1, 0.9, 0.1)
  B <- t(bSpline(score,
                 knots = spline_knots,
                 degree = 3,
                 intercept = TRUE,
                 Boundary.knots = c(0, 1)))
  stan_data <- list(
    num_data = length(annotated),
    num_basis = nrow(B),
    annotated = annotated,
    B = B,
    score = score,
    justprior = 0
  )
  fit <- rstan::stan("sigmoid-spline.stan", data = stan_data, chains = 4)
  extract(fit, permuted = TRUE)
}

# Return `color` with its opacity multiplied by `alphas`.
#   color:  a colour specification accepted by col2rgb(); only the first
#           colour is used.
#   alphas: opacity multiplier. 1 keeps the colour's own alpha, 0 makes it
#           fully transparent. The product with the colour's alpha must stay
#           within 0..255 or rgb() will reject it.
# Returns (invisibly) the resulting "#RRGGBBAA" string(s).
t_col <- function(color, alphas) {
  ## Get RGBA values (0-255) for the input colour
  rgb.val <- col2rgb(color, alpha = TRUE)

  ## Rebuild the colour with the scaled alpha channel
  t.col <- rgb(rgb.val[1], rgb.val[2], rgb.val[3],
               maxColorValue = 255,
               alpha = alphas * rgb.val[4],
               names = NULL)

  ## Return the colour without printing it
  invisible(t.col)
}

# Draw the posterior of an isotone fit on the current plot.
#   ds:         draws as returned by runstan_isotone(); uses ds$a (one row per
#               draw), ds$b (one value per draw) and ds$lp__.
#   color:      colour of the faint per-draw curves and of the band edges.
#   bestcol:    colour of the curve for the draw with the highest lp__.
#   thinamount: only every thinamount-th draw is plotted / used for the band.
#   bestlwd:    line width of the highest-lp__ curve.
# Each curve is 2 * sigmoid(a %*% basis + b) - 1 evaluated on 0, 0.01, ..., 1.
# Called for its side effect of drawing; an open plot is required.
plotstan.isotone <- function(ds,color,bestcol=color,thinamount,bestlwd=3) {
  xs <- seq(0, 1, 0.01)
  # Same knots and boundary as the basis built in runstan_isotone().
  Idisplay <- t(ibs(xs, knots = c(seq(0.1, 0.9, 0.1), 0.95), degree = 3,
                    intercept = TRUE, Boundary.knots = c(0, 1)))

  # Number of draws is the number of ROWS of ds$a, not its total length.
  n_draws <- nrow(ds$a)
  n_kept <- n_draws %/% thinamount
  stopifnot(n_kept >= 1)
  kept <- seq_len(n_kept) * thinamount

  ys <- matrix(NA_real_, nrow = n_kept, ncol = length(xs))
  faint <- t_col(color, 0.02)
  for (k in seq_len(n_kept)) {
    Z2 <- as.vector(ds$a[kept[k], ] %*% Idisplay)
    ys[k, ] <- 2 * sigmoid(Z2 + ds$b[kept[k]]) - 1
    lines(xs, ys[k, ], col = faint)
  }

  # Pointwise band: rows at 99% and 1% of the column-wise sorted curves.
  # matrix() keeps the shape when only one draw is kept; the small epsilon
  # and max(1, .) guard against fractional or zero row indices.
  sorted <- matrix(apply(ys, 2, sort, na.last = TRUE), nrow = n_kept)
  upper_row <- max(1, floor(0.99 * n_kept + 1e-8))
  lower_row <- max(1, floor(0.01 * n_kept + 1e-8))
  lines(xs, sorted[upper_row, ], col = color)
  lines(xs, sorted[lower_row, ], col = color)

  # Curve of the draw with the highest log-posterior. Both a and b must come
  # from that same draw (previously b was taken from the last thinned draw).
  best <- which.max(ds$lp__)
  Z2 <- as.vector(ds$a[best, ] %*% Idisplay)
  Z3 <- 2 * sigmoid(Z2 + ds$b[best]) - 1
  lines(xs, Z3, col = bestcol, lwd = bestlwd)

  # Diagonal reference line.
  lines(c(0, 1), c(0, 1), col = "grey")
}

# Draw the posterior of a non-isotone fit on the current plot.
#   ds:         draws as returned by runstan_nonisotone(); uses ds$a (one row
#               per draw) and ds$lp__.
#   color:      colour of the faint per-draw curves and of the band edges.
#   bestcol:    colour of the curve for the draw with the highest lp__.
#   thinamount: only every thinamount-th draw is plotted / used for the band.
# Each curve is sigmoid(a %*% basis) evaluated on 0, 0.01, ..., 1.
plotstan.nonisotone <- function(ds,color,bestcol=color,thinamount) {
  grid <- seq(0, 1, 0.01)
  basis <- t(bSpline(grid, knots = seq(0.1, 0.9, 0.1), degree = 3,
                     intercept = TRUE, Boundary.knots = c(0, 1)))
  # Calibration curve implied by one posterior draw.
  curve_for <- function(draw) {
    sigmoid(as.vector(0 * grid + ds$a[draw, ] %*% basis))
  }

  n_draws <- length(ds$a[, 1])
  n_thinned <- n_draws / thinamount
  curves <- array(NA, dim = c(n_thinned, length(grid)))
  faint <- t_col(color, 0.02)
  for (k in 1:n_thinned) {
    this_curve <- curve_for(thinamount * k)
    curves[k, ] <- this_curve
    lines(grid, this_curve, col = faint)
  }

  # Band edges: rows at 99% and 1% of the column-wise sorted curves.
  ranked <- apply(curves, 2, sort)
  lines(grid, ranked[.99 * n_draws / thinamount, ], col = color)
  lines(grid, ranked[.01 * n_draws / thinamount, ], col = color)

  # Curve of the draw with the highest log-posterior, then the diagonal.
  lines(grid, curve_for(which.max(ds$lp__)), col = bestcol, lwd = 3)
  lines(grid, grid, col = "grey")
}


# Fit both spline models to the Perspective data (labels D, scores E).
dstoxicitynoniso <- runstan_nonisotone(D,E)
dstoxicityiso <- runstan_isotone(D,E)
# NOTE(review): this passes the summed annotator count as `annotated`, whereas
# the ROC section binarises it with count>=2 -- confirm that the Stan model
# is meant to receive the raw count here.
dsdavidsoniso <- runstan_isotone(davidson$count,davidson$prob_hate)
# Figure: isotone posterior for Davidson (blue) and Perspective (red) on a
# square plot region, written to calibration-curve0.95.pdf.
pdf("calibration-curve0.95.pdf",width=5,height=5,paper='special')
par(pty="s")
plot.new()
axis(1)
axis(2)
plotstan.isotone(dsdavidsoniso,color="blue",bestcol=rgb(0,0,0.5,1),thinamount=10)
plotstan.isotone(dstoxicityiso,color="red",bestcol=rgb(0.5,0,0,1),thinamount=10)
title(main="", sub="",
      mgp=c(2.5,2.5,0),xlab="Classifier score for toxicity/hate", ylab="Probability of toxicity/hate") 
# Curve labels, placed at hand-picked coordinates.
text(0.81,0.05,"Perspective",col="red")
text(0.5,0.3,"Davidson et al.",col="blue")
dev.off()
# PLOTTED AS calibration-curve.eps

# Figure: Stan isotone posterior for Perspective (red, with the highest-lp__
# curve drawn fully transparent) overlaid with the isoreg() fit (purple).
pdf("stan-v-isotone.pdf", width = 4, height = 4, paper = "special")
par(pty = "s")
plot.new()
axis(side = 1, at = c(0, 1), labels = c(0, 1), mgp = c(0, 0.6, 0))
axis(side = 2, at = c(0, 1), labels = c(0, 1), mgp = c(0, 0.5, 0))
plotstan.isotone(
  dstoxicityiso,
  color = "red",
  bestcol = rgb(0.8, 0, 0, alpha = 0),
  thinamount = 10
)
lines(isoreg(E, D), col = rgb(0.5, 0, 0.5, alpha = 1), lwd = 2)
title(
  main = "", sub = "",
  xlab = "Perspective toxicity score",
  ylab = "Probability of toxicity",
  mgp = c(1, 0, 0)
)
dev.off()
# PLOTTED as stan-v-isotone 3.5x3.5

# Split-half check: refit both models on the odd-indexed (A) and the
# even-indexed (B) rows of the sorted Perspective data.
dstoxicitynonisosplitA <- runstan_nonisotone(
  D[seq(1, length(D), by = 2)],
  E[seq(1, length(E), by = 2)]
)
dstoxicitynonisosplitB <- runstan_nonisotone(
  D[seq(2, length(D), by = 2)],
  E[seq(2, length(E), by = 2)]
)
dstoxicityisosplitA <- runstan_isotone(
  D[seq(1, length(D), by = 2)],
  E[seq(1, length(E), by = 2)]
)
dstoxicityisosplitB <- runstan_isotone(
  D[seq(2, length(D), by = 2)],
  E[seq(2, length(E), by = 2)]
)

# Overlay the two isotone half-fits (purple, orange; best curve transparent)
# and the highest-lp__ curve of the full fit (black; draws transparent).
plot.new()
axis(side = 1)
axis(side = 2)
plotstan.isotone(dstoxicityisosplitA, color = "purple",
                 bestcol = rgb(0, 0, 0, alpha = 0), thinamount = 10)
plotstan.isotone(dstoxicityisosplitB, color = "orange",
                 bestcol = rgb(0, 0, 0, alpha = 0), thinamount = 10)
plotstan.isotone(dstoxicityiso, color = rgb(0, 0, 0, alpha = 0),
                 bestcol = rgb(0, 0, 0, alpha = 1), thinamount = 10)
title(
  main = "", sub = "",
  xlab = "Perspective toxicity score",
  ylab = "Probability of toxicity",
  mgp = c(2.5, 2.5, 0)
)
# PLOTTED as stan-with-split


# Largest pointwise width of the posterior band of an isotone fit.
#   ds:         draws as returned by runstan_isotone(); uses ds$a (one row per
#               draw) and ds$b (one value per draw).
#   thinamount: only every thinamount-th draw is used.
# Each curve is 2 * sigmoid(a %*% basis + b) - 1 on the grid 0, 0.01, ..., 1.
# The band runs between the rows at 1% and 99% of the column-wise sorted
# curves; the return value is the maximum of (upper - lower) over the grid.
getisotoneconfidencemax <- function(ds,thinamount=10) {
  xs <- seq(0, 1, 0.01)
  # Must use the same knots (including 0.95) as runstan_isotone(); otherwise
  # the basis has fewer rows than ds$a has columns and %*% fails.
  Idisplay <- t(ibs(xs, knots = c(seq(0.1, 0.9, 0.1), 0.95), degree = 3,
                    intercept = TRUE, Boundary.knots = c(0, 1)))

  # Number of draws is the number of ROWS of ds$a, not its total length.
  n_draws <- nrow(ds$a)
  n_kept <- n_draws %/% thinamount
  stopifnot(n_kept >= 1)
  kept <- seq_len(n_kept) * thinamount

  # One row per kept draw. as.vector() strips the 1-d array dim of ds$b so
  # the intercepts recycle down the rows of the linear predictor.
  linear <- ds$a[kept, , drop = FALSE] %*% Idisplay
  ys <- 2 * sigmoid(linear + as.vector(ds$b)[kept]) - 1

  # matrix() keeps the shape when only one draw is kept; the small epsilon
  # and max(1, .) guard against fractional or zero row indices.
  sorted <- matrix(apply(ys, 2, sort, na.last = TRUE), nrow = n_kept)
  Zmax <- sorted[max(1, floor(0.99 * n_kept + 1e-8)), ]
  Zmin <- sorted[max(1, floor(0.01 * n_kept + 1e-8)), ]
  max(Zmax - Zmin)
}


# Refit the isotone model on evenly strided subsamples of the sorted
# Perspective data and record how the widest posterior band shrinks.
# NOTE(review): the stride 1000 / n assumes the data set has 1000 rows, so
# that each subsample holds about n items -- confirm against length(D).
splits <- seq(100, 1000, 100)
dstoxicitysplit <- vector("list", length(splits))
for (k in seq_along(splits)) {
  stride <- 1000 / splits[k]
  # Fractional positions produced by seq() are truncated when used as indices.
  dstoxicitysplit[[k]] <- runstan_isotone(D[seq(1, length(D), stride)],
                                          E[seq(1, length(E), stride)])
}
confidence <- vapply(dstoxicitysplit, getisotoneconfidencemax, numeric(1))
plot(splits, confidence, xlim = c(100, 1000), ylim = c(0, 1), xlab = "", ylab = "")
# The former `lines(predict(lo))` call was dropped: `lo` is never defined in
# this script, so it stopped execution here. Even with a fitted model it
# would have drawn at x = 1..10, outside the plotted range of 100..1000.
lines(smooth.spline(splits, confidence, spar = 0.5))
title(main = "", sub = "",
      mgp = c(2.5, 2.5, 0), xlab = "Number of annotations", ylab = "Maximum 98% confidence interval")
# PLOTTED as confidence-improves.cps


# Refit the isotone model on strided subsamples with stride 1000/2^n for
# n = 1..6 (fractional positions are truncated when used as indices).
dstoxicitysplitpowers = NULL
for(n in 1:6) {
  fit <- runstan_isotone(D[seq(1, length(D),1000/2^n)],E[seq(1, length(E),1000/2^n)])
  dstoxicitysplitpowers[n] = list(fit)
}
colorchoices=c("black","purple","blue","green","orange","red")
plot.new()
axis(1)
axis(2)
# Fits 2..6, each in its own colour, with the highest-lp__ curve transparent.
for(n in 2:6) {
  plotstan.isotone(dstoxicitysplitpowers[[n]],colorchoices[n],bestcol=rgb(0,0,0,0),thinamount=10)
}
# NOTE(review): this plots dstoxicitysplit[[1]] (from the 100..1000 sweep),
# not dstoxicitysplitpowers[[1]] -- confirm that this is intended.
plotstan.isotone(dstoxicitysplit[[1]],colorchoices[1],bestcol=rgb(0,0,0,1),thinamount=10)
title(main="", sub="",
      mgp=c(2.5,2.5,0),xlab="Perspective toxicity score", ylab="Probability of toxicity") 
# PLOTTED as confidence-improvesB.cps


####
## ANNOTATOR DIFFERENCE CURVES STAN
####
annotators <- c("Anastasiou", "Burden", "Furman", "Salzano", "Whittinger")
annotatorcolor <- c("indianred", "orange", "green4", "hotpink", "purple")

# One isotone fit per annotator column (columns 3 to 7 of `sortedtoxicity`),
# all against the same sorted score vector E.
# `[[` is required: on a tibble, `sortedtoxicity[, n + 2]` is a one-column
# tibble whose length() is 1, which would make num_data wrong in the fit.
dstoxicitysplitannotator <- vector("list", length(annotatorcolor))
for (n in seq_along(annotatorcolor)) {
  dstoxicitysplitannotator[[n]] <- runstan_isotone(sortedtoxicity[[n + 2]], E)
}

# par() only affects the currently active device, so the PDF has to be opened
# first for the square plot region to apply to it.
pdf("stan-annotators0.95.pdf", width = 4, height = 4, paper = "special")
par(pty = "s")
plot.new()
axis(1, at = c(0, 1), labels = c(0, 1), mgp = c(0, 0.6, 0))
axis(2, at = c(0, 1), labels = c(0, 1), mgp = c(0, 0.5, 0))
# Per-annotator highest-lp__ curves; the per-draw curves are transparent.
for (n in seq_along(annotatorcolor)) {
  plotstan.isotone(dstoxicitysplitannotator[[n]], rgb(0, 0, 0, 0),
                   bestcol = t_col(annotatorcolor[n], 0.7),
                   thinamount = 10, bestlwd = 2)
}
# Highest-lp__ curve of the fit to the majority label, drawn on top.
plotstan.isotone(dstoxicityiso, rgb(0, 0, 0, 0), bestcol = rgb(0, 0, 0, 0.7),
                 thinamount = 10, bestlwd = 3)
title(main = "", sub = "",
      mgp = c(1, 0, 0), xlab = "Perspective toxicity score", ylab = "Probability of toxicity")
dev.off()
# PLOT stan-annotators



