library(lme4)
library(lmerTest)

# Read the per-observation results table from its CSV export.
bnc <- read.csv(file = "/Volumes/Disk1/erp/results_conll/bnc_nonac_gpt2-ft.csv")

# Add natural-log versions of three columns; the models below use these.
# `[[ ]]` is used so column names are matched exactly (no partial matching).
# Note: log() gives NaN (with a warning) for negative input and -Inf for 0.
# NOTE(review): the recorded model outputs report different numbers of
# observations per response, so some rows presumably lack usable values
# in some columns -- confirm against the data.
bnc[["logh"]] <- log(bnc[["xu_h"]])
bnc[["loghdoc"]] <- log(bnc[["xu_h_document"]])
bnc[["logp"]] <- log(bnc[["position"]])


# ================ Decontextualised information content ================

# -------------- Document --------------
# Linear mixed-effects model of logh on logp, with a correlated random
# intercept and random logp slope for each level of `path`.
# With lmerTest attached, summary() also reports df and p-values for the
# fixed effects.
m <- lmer(
  formula = logh ~ 1 + logp + (1 + logp | path),
  data = bnc
)
summary(m)
# Random effects:
#  Groups   Name        Variance  Std.Dev. Corr
#  path     (Intercept) 0.0098616 0.09931
#           logp        0.0003387 0.01840  -0.79
#  Residual             0.0340805 0.18461
# Number of obs: 328472, groups:  path, 228

# Fixed effects:
#               Estimate Std. Error         df t value Pr(>|t|)
# (Intercept)  -0.030351   0.006981 223.249002  -4.348 2.09e-05 ***
# logp          0.001729   0.001289 196.242855   1.342    0.181


# ================ Contextualised information content ================

# -------------- Document --------------
# Linear mixed-effects model of loghdoc on logp, with a correlated random
# intercept and random logp slope for each level of `path`.
# Reassigns `m`, replacing any previously fitted model held in that name.
m <- lmer(
  formula = loghdoc ~ 1 + logp + (1 + logp | path),
  data = bnc
)
summary(m)
# Random effects:
#  Groups   Name        Variance  Std.Dev. Corr
#  path     (Intercept) 0.0147986 0.12165
#           logp        0.0004662 0.02159  -0.73
#  Residual             0.1125698 0.33551
# Number of obs: 328451, groups:  path, 228

# Fixed effects:
#               Estimate Std. Error         df t value Pr(>|t|)
# (Intercept)   0.015539   0.009062 224.057530   1.715   0.0878 .
# logp         -0.010235   0.001605 181.760052  -6.377 1.45e-09 ***


# ==================== Mutual information ====================

# -------------- Document --------------
# Linear mixed-effects model of xu_mi_document (used untransformed) on
# logp, with a correlated random intercept and random logp slope for each
# level of `path`.
# Reassigns `m`, replacing any previously fitted model held in that name.
m <- lmer(
  formula = xu_mi_document ~ 1 + logp + (1 + logp | path),
  data = bnc
)
summary(m)
# Random effects:
#  Groups   Name        Variance Std.Dev. Corr
#  path     (Intercept) 0.066807 0.25847
#           logp        0.001957 0.04424  -0.71
#  Residual             1.182406 1.08738
# Number of obs: 328446, groups:  path, 228

# Fixed effects:
#              Estimate Std. Error        df t value Pr(>|t|)
# (Intercept) 7.028e-01  2.148e-02 2.316e+02   32.72   <2e-16 ***
# logp        4.786e-02  3.679e-03 1.900e+02   13.01   <2e-16 ***