library(lme4)
library(lmerTest)

# Load the results table that every model in this script is fitted on.
bncs <- read.csv(
  '/Volumes/Disk1/erp/results_conll/bnc_spoken_gpt2-ft.csv'
)

# Add natural-log versions of the two information-content columns and of
# position; the models below refer to these new columns by name.
bncs[["logh"]] <- log(bncs[["xu_h"]])
bncs[["loghdial"]] <- log(bncs[["xu_h_dialogue"]])
bncs[["logp"]] <- log(bncs[["position"]])


# ================ Decontextualised information content ================

# -------------- Dialogue --------------
# Mixed-effects regression of log information content (logh) on log
# position (logp), with a random intercept and a random logp slope for
# each dialogue_id. `m` is reused for every model fitted in this script.
m <- lmer(
  formula = logh ~ 1 + logp + (1 + logp | dialogue_id),
  data = bncs
)
summary(m)
# Random effects:
 # Groups      Name        Variance  Std.Dev. Corr 
 # dialogue_id (Intercept) 0.0042225 0.06498       
             # logp        0.0001276 0.01129  -0.81
 # Residual                0.0853238 0.29210       
# Number of obs: 132101, groups:  dialogue_id, 187

# Fixed effects:
              # Estimate Std. Error         df t value Pr(>|t|)    
# (Intercept)  -0.049385   0.006770 166.585303  -7.295 1.16e-11 ***
# logp          0.001414   0.001230 149.969370   1.150    0.252  


# ================ Contextualised information content ================

# -------------- Dialogue --------------
# Mixed-effects regression of xu_h_dialogue on log position (logp), with a
# random intercept and a random logp slope for each dialogue_id.
# NOTE(review): the response here is the raw xu_h_dialogue column, whereas
# the decontextualised model uses the log-transformed logh, and the
# loghdial column is never used -- confirm the raw scale is intended.
m <- lmer(
  formula = xu_h_dialogue ~ 1 + logp + (1 + logp | dialogue_id),
  data = bncs
)
summary(m)
# Random effects:
 # Groups      Name        Variance Std.Dev. Corr 
 # dialogue_id (Intercept) 0.056833 0.23840       
             # logp        0.002005 0.04478  -0.95
 # Residual                0.399428 0.63200       
# Number of obs: 132069, groups:  dialogue_id, 187

# Fixed effects:
              # Estimate Std. Error         df t value Pr(>|t|)    
# (Intercept)   1.263502   0.020512 182.502045   61.60   <2e-16 ***
# logp         -0.050314   0.003877 172.921032  -12.98   <2e-16 ***


# ==================== Mutual information ====================

# -------------- Dialogue --------------
# Mixed-effects regression of xu_mi_dialogue on log position (logp), with a
# random intercept and a random logp slope for each dialogue_id.
m <- lmer(
  formula = xu_mi_dialogue ~ 1 + logp + (1 + logp | dialogue_id),
  data = bncs
)
summary(m)
# Random effects:
 # Groups      Name        Variance Std.Dev. Corr 
 # dialogue_id (Intercept) 0.09452  0.3074        
             # logp        0.00255  0.0505   -0.88
 # Residual                2.21244  1.4874        
# Number of obs: 132069, groups:  dialogue_id, 187

# Fixed effects:
             # Estimate Std. Error        df t value Pr(>|t|)    
# (Intercept) 5.893e-01  3.300e-02 1.712e+02   17.86   <2e-16 ***
# logp        7.728e-02  5.823e-03 1.600e+02   13.27   <2e-16 ***