#####
## options
# Switch for predictor centering: when this equals 1 the numeric predictors
# are mean-centered before modelling; any other value leaves them as read.
CENTERED <- 1


#####
## utilities, etc
# Tile `a` into an n-by-m grid of copies (in the manner of MATLAB's repmat).
# The result is the Kronecker product of an n-by-m matrix of ones with `a`.
repmat <- function(a, n, m) {
  tiling <- matrix(1, nrow = n, ncol = m)
  tiling %x% a
}


#####
## read in data
# Names of the per-subject reading-time columns: "SPR" followed by the
# subject id. Ids 15, 26 and 27 are not in the set.
subjects <- paste0("SPR", c(7:14, 16:25, 28:32))
# Four whitespace-delimited tables with a header row, read from the working
# directory. Their columns are combined positionally further down.
# ModeledData: supplies surprisal, entropyrdc and avgdepth
ModeledData <- data.frame(read.table("readingdata.gf.des.tab", header = TRUE))
# EmpiricalData: base table (word, sc.initial, sc.final, order and the
# per-subject reading-time columns are read from it later)
EmpiricalData <- data.frame(read.table("readingdata.data.tab", header = TRUE))
# NgramData: supplies unigram and bigram
NgramData <- data.frame(
  read.table("readingdata.brownnp.ngrampred.tab", header = TRUE)
)
# ClassData: supplies closed
ClassData <- data.frame(read.table("readingdata.closed", header = TRUE))


#####
## put all into one and do some data munging
# Start from the empirical table and bolt the predictor columns of the other
# tables onto it. Columns are copied by position, so this relies on every
# table listing the same words in the same order (not checked here).
TightData <- EmpiricalData

# from the n-gram table
TightData$unigram <- NgramData$unigram
TightData$bigram <- NgramData$bigram

# from the modeled-data table
TightData$surprisal <- ModeledData$surprisal
TightData$entropyrdc <- ModeledData$entropyrdc
TightData$avgdepth <- ModeledData$avgdepth

# from the word-class table
TightData$closed <- ClassData$closed

# add depthdiff variable: avgdepth of each row minus avgdepth of the row
# before it. The first row has no predecessor and keeps its raw avgdepth
# (ok done this way b/c will discard 1st and last word/sentence).
# Computed as one vectorized lagged difference: the previous
# `for (i in 2:length(...))` loop counted backwards (2, 1) and failed on a
# table with fewer than two rows, and grew the vector one element at a time.
depthdiff <- local({
  depth <- ModeledData$avgdepth[seq_len(length(TightData$word))]
  # previous-row values, with 0 standing in for the missing predecessor
  depth - c(0, depth[-length(depth)])
})
TightData$depthdiff <- depthdiff
# add sentence variable: a running counter that starts at 1 on the first row
# (that row's own sc.initial is ignored) and grows by sc.initial on every
# later row.
# cumsum() replaces the `for (i in 2:length(...))` loop, which counted
# backwards (2, 1) for a single-row table and grew the vector per element.
sentence <- local({
  steps <- TightData$sc.initial[seq_len(length(TightData$word))]
  if (length(steps) > 0) {
    steps[1] <- 1 # the counter's starting value
  }
  cumsum(steps)
})
TightData$sentence <- sentence
# word length in characters, plus its reciprocal
TightData$length <- nchar(as.character(TightData$word))
TightData$rlength <- 1 / TightData$length

# log-probabilities: exact zeros are first bumped to a tiny positive value
# so that log() stays finite
TightData$unigram <- replace(
  TightData$unigram, which(TightData$unigram == 0), 0.00000001
)
TightData$bigram <- replace(
  TightData$bigram, which(TightData$bigram == 0), 0.00000001
)
TightData$logunigram <- log(TightData$unigram)
TightData$logbigram <- log(TightData$bigram)
# erase first and last words, each line: drop every row flagged by a
# non-zero sc.initial or sc.final
ind <- which(TightData$sc.initial != 0 | TightData$sc.final != 0)
# Guard the removal: indexing with -integer(0) selects *no* rows, so an
# unguarded TightData[-ind, ] would empty the table when nothing is flagged.
if (length(ind) > 0) {
  TightData <- TightData[-ind, ]
}
# use mean-centered versions: with CENTERED == 1 each listed column has its
# own mean subtracted; `closed` (and every unlisted column) is left as is.
# OtherData is the table all later steps work from.
if (CENTERED == 1) {
  CTightData <- local({
    centered_cols <- c(
      "order", "unigram", "bigram", "surprisal", "entropyrdc", "avgdepth",
      "depthdiff", "sentence", "length", "rlength", "logunigram", "logbigram"
    )
    shifted <- TightData
    shifted[centered_cols] <- lapply(
      TightData[centered_cols],
      function(column) column - mean(column)
    )
    shifted
  })
  OtherData <- CTightData
} else {
  OtherData <- TightData
}

# number of word rows that survived the filtering above
NUM_WORDS <- length(OtherData$word)
# free the tables that are no longer needed (EmpiricalData is kept)
rm(ModeledData, NgramData)


#####
## transform the matrix of subjects' data into a vector, 
##    w/ repeated predictors for each subject
# Reshape to long format: one row per (subject, word) pair. The word-level
# columns are repeated for every subject and followed by the subject id
# ("spr") and that subject's reading time ("time").
#
# Changes from the loop-and-cbind version this replaces:
#   * non-numeric columns (e.g. `word`, which read.table() returns as
#     character since R 4.0) are converted to integer codes up front; a plain
#     cbind() of a character column turned the whole matrix, reading times
#     included, into text;
#   * subject count and ids come from `subjects`, not a hard-coded 1:23 and
#     a second copy of the id list;
#   * a missing subject column is reported instead of silently shortening
#     the reading-time vector.
missing_subjects <- setdiff(subjects, names(OtherData))
if (length(missing_subjects) > 0) {
  stop("subject columns missing from the data: ",
       paste(missing_subjects, collapse = ", "), call. = FALSE)
}
rm(missing_subjects)

# reading times stacked subject by subject, in the order of `subjects`
subject_times <- unlist(
  lapply(subjects, function(s) OtherData[[s]]),
  use.names = FALSE
)

# from here on OtherData holds the word-level columns only
OtherData <- OtherData[, !(names(OtherData) %in% subjects), drop = FALSE]
headers <- names(OtherData)

FlatData <- local({
  # numeric id taken from the column name, e.g. "SPR7" -> 7
  subject_ids <- as.numeric(sub("^SPR", "", subjects))
  # all-numeric copy of the predictors (factor/character -> integer codes)
  predictors <- data.matrix(OtherData)
  dimnames(predictors) <- NULL
  n_words <- nrow(predictors)
  row_index <- rep(seq_len(n_words), times = length(subjects))
  stacked <- as.data.frame(cbind(
    predictors[row_index, , drop = FALSE],
    rep(subject_ids, each = n_words),
    as.numeric(subject_times)
  ))
  names(stacked) <- c(headers, "spr", "time")
  stacked
})
rm(subject_times)

# remove too-fast or too-slow responses: keep rows with 150 <= time <= 1500.
# A logical mask is used rather than FlatData[-which(...), ]: when no row
# matches, which() returns integer(0) and the negative index then selects
# zero rows, wiping the whole table. `%in% TRUE` maps NA comparisons to
# FALSE, so rows with a missing time are kept, as they were before.
FlatData <- FlatData[
  !((FlatData$time < 150 | FlatData$time > 1500) %in% TRUE),
]

#####
## run the model

  # by default, use log and mean-centered time data: the response becomes
  # log(time) minus the mean of log(time) over all remaining rows
  FlatData[["time"]] <- local({
    log_time <- log(FlatData[["time"]])
    log_time - mean(log_time)
  })

  # test for what word length to use
#  none.nolength = lmer( time ~ order + unigram + bigram + (1|spr) + (1|word) + (1|sentence), data=FlatData )
#  none.length = update( none.nolength, time ~ order + length + unigram + bigram + (1|spr) + (1|word) + (1|sentence) )
#  none.rlength = update( none.nolength, time ~ order + rlength + unigram + bigram + (1|spr) + (1|word) + (1|sentence) )

  # test for how to calculate times
#  none.time = lmer( time ~ order + unigram + bigram + rlength + (1|spr) + (1|word) + (1|sentence), data=FlatData )
#  none.logtime = lmer( time ~ order + unigram + bigram + rlength + (1|spr) + (1|word) + (1|sentence), data=FlatData )

  # one-at-a-time for the new metrics
#  s.1   = lmer( time ~ order + rlength + unigram + bigram + surprisal  + (1|spr) + (1|word) + (1|sentence), data=FlatData )
#  e.1   = lmer( time ~ order + rlength + unigram + bigram + entropyrdc + (1|spr) + (1|word) + (1|sentence), data=FlatData )
#  d.1   = lmer( time ~ order + rlength + unigram + bigram + depthdiff   + (1|spr) + (1|word) + (1|sentence), data=FlatData )
  # lmer() is not defined anywhere in this script and no package was attached
  # for it, so the fits failed in a fresh R session. Attach lme4 here.
  # NOTE(review): assumes lme4's lmer is the intended one; if a wrapper such
  # as lmerTest was attached beforehand, this call leaves it in place.
  library(lme4)

  # Every model below has random intercepts for subject (spr), word and
  # sentence. The first four add two or three of depthdiff / entropyrdc /
  # surprisal to the baseline predictors order, rlength, unigram and bigram.
  es.1 <- lmer(
    time ~ order + rlength + unigram + bigram + entropyrdc + surprisal +
      (1|spr) + (1|word) + (1|sentence),
    data = FlatData
  )
  ds.1 <- lmer(
    time ~ order + rlength + unigram + bigram + depthdiff + surprisal +
      (1|spr) + (1|word) + (1|sentence),
    data = FlatData
  )
  de.1 <- lmer(
    time ~ order + rlength + unigram + bigram + depthdiff + entropyrdc +
      (1|spr) + (1|word) + (1|sentence),
    data = FlatData
  )
  des.1 <- lmer(
    time ~ order + rlength + unigram + bigram + depthdiff + entropyrdc +
      surprisal + (1|spr) + (1|word) + (1|sentence),
    data = FlatData
  )
  # surprisal as the only fixed effect, without the baseline predictors
  sonly.1 <- lmer(
    time ~ surprisal + (1|spr) + (1|word) + (1|sentence),
    data = FlatData
  )
#sink("readingtime.centlogtime.results")
#summary(s.1); summary(e.1); summary(d.1)
#summary(es.1); summary(ds.1); summary(de.1); summary(des.1)
#anova(des.1,es.1); anova(des.1,ds.1); anova(des.1,de.1)
#anova(des.1,s.1); anova(des.1,e.1); anova(des.1,d.1)
#anova(es.1,e.1); anova(es.1,s.1)
#anova(ds.1,d.1); anova(ds.1,s.1)
#anova(de.1,d.1); anova(de.1,e.1)

#desinter.1 = lmer( time ~ (order + rlength + unigram + bigram + depthdiff + entropyrdc + surprisal)*closed + (1|spr) + (1|word) + (1|sentence), data=FlatData )
#sink("readingtime.centlogtime.inter.results")
#summary(desinter.1)

  # split between open and closed classes
#ClosedData = FlatData[-which(FlatData$closed==1),]
#s.c   = lmer( time ~ order + rlength + unigram + bigram + surprisal  + (1|spr) + (1|word) + (1|sentence), data=ClosedData )
#e.c   = lmer( time ~ order + rlength + unigram + bigram + entropyrdc + (1|spr) + (1|word) + (1|sentence), data=ClosedData )
#d.c   = lmer( time ~ order + rlength + unigram + bigram + depthdiff   + (1|spr) + (1|word) + (1|sentence), data=ClosedData )
#es.c  = lmer( time ~ order + rlength + unigram + bigram + entropyrdc + surprisal  + (1|spr) + (1|word) + (1|sentence), data=ClosedData )
#ds.c  = lmer( time ~ order + rlength + unigram + bigram + depthdiff   + surprisal  + (1|spr) + (1|word) + (1|sentence), data=ClosedData )
#de.c  = lmer( time ~ order + rlength + unigram + bigram + depthdiff   + entropyrdc + (1|spr) + (1|word) + (1|sentence), data=ClosedData )
#des.c = lmer( time ~ order + rlength + unigram + bigram + depthdiff   + entropyrdc + surprisal  + (1|spr) + (1|word) + (1|sentence), data=ClosedData )
#sink("readingtime.centlogtime.closed.results")
#summary(s.c); summary(e.c); summary(d.c)
#summary(es.c); summary(ds.c); summary(de.c); summary(des.c)
#anova(des.c,es.c); anova(des.c,ds.c); anova(des.c,de.c)
#anova(des.c,s.c); anova(des.c,e.c); anova(des.c,d.c)
#anova(es.c,e.c); anova(es.c,s.c)
#anova(ds.c,d.c); anova(ds.c,s.c)
#anova(de.c,d.c); anova(de.c,e.c)
  
#OpenData = FlatData[-which(FlatData$closed==0),]
#s.o   = lmer( time ~ order + rlength + unigram + bigram + surprisal  + (1|spr) + (1|word) + (1|sentence), data=OpenData )
#e.o   = lmer( time ~ order + rlength + unigram + bigram + entropyrdc + (1|spr) + (1|word) + (1|sentence), data=OpenData )
#d.o   = lmer( time ~ order + rlength + unigram + bigram + depthdiff   + (1|spr) + (1|word) + (1|sentence), data=OpenData )
#es.o  = lmer( time ~ order + rlength + unigram + bigram + entropyrdc + surprisal  + (1|spr) + (1|word) + (1|sentence), data=OpenData )
#ds.o  = lmer( time ~ order + rlength + unigram + bigram + depthdiff   + surprisal  + (1|spr) + (1|word) + (1|sentence), data=OpenData )
#de.o  = lmer( time ~ order + rlength + unigram + bigram + depthdiff   + entropyrdc + (1|spr) + (1|word) + (1|sentence), data=OpenData )
#des.o = lmer( time ~ order + rlength + unigram + bigram + depthdiff   + entropyrdc + surprisal  + (1|spr) + (1|word) + (1|sentence), data=OpenData )
#sink("readingtime.centlogtime.open.results")
#summary(s.o); summary(e.o); summary(d.o)
#summary(es.o); summary(ds.o); summary(de.o); summary(des.o)
#anova(des.o,es.o); anova(des.o,ds.o); anova(des.o,de.o)
#anova(des.o,s.o); anova(des.o,e.o); anova(des.o,d.o)
#anova(es.o,e.o); anova(es.o,s.o)
#anova(ds.o,d.o); anova(ds.o,s.o)
#anova(de.o,d.o); anova(de.o,e.o)

  
#  sink()

