library(lme4)
library(lmerTest)

# Read the results table from its fixed location on disk.
ptb <- read.csv("/Volumes/Disk1/erp/results_conll/ptb_gpt2-ft.csv")

# Append natural-log versions of the three xu_h* columns and of the two
# position columns. The new columns are added in the order listed here.
ptb <- transform(
  ptb,
  logh    = log(xu_h),
  loghdoc = log(xu_h_document),
  loghpar = log(xu_h_paragraph),
  logpdoc = log(position_in_doc),
  logppar = log(position_in_par)
)


# ================ Decontextualised information content ================

# -------------- Document --------------
# logh regressed on logpdoc, with a by-doc_id random intercept and a
# by-doc_id random slope for logpdoc.
m <- lmer(
  logh ~ 1 + logpdoc + (1 + logpdoc | doc_id),
  data = ptb
)
summary(m)
# Random effects:
 # Groups   Name        Variance Std.Dev. Corr 
 # doc_id   (Intercept) 0.02946  0.17163       
          # logpdoc     0.00176  0.04195  -0.64
 # Residual             0.03564  0.18880       
# Number of obs: 8593, groups:  doc_id, 400

# Fixed effects:
              # Estimate Std. Error         df t value Pr(>|t|)    
# (Intercept)  -0.096701   0.010372 376.586288  -9.323  < 2e-16 ***
# logpdoc       0.021969   0.003528 180.882209   6.226 3.25e-09 ***

# -------------- Paragraph --------------
# logh regressed on logppar, with a by-doc_id random intercept and a
# by-doc_id random slope for logppar.
m <- lmer(
  logh ~ 1 + logppar + (1 + logppar | doc_id),
  data = ptb
)
summary(m)
# Random effects:
 # Groups   Name        Variance Std.Dev. Corr 
 # doc_id   (Intercept) 0.022299 0.14933       
          # logppar     0.003878 0.06228  -0.50
 # Residual             0.035945 0.18959       
# Number of obs: 8593, groups:  doc_id, 400

# Fixed effects:
              # Estimate Std. Error         df t value Pr(>|t|)    
# (Intercept)  -0.071007   0.008330 381.207352  -8.524 3.65e-16 ***
# logppar       0.031426   0.005927 204.993706   5.302 2.95e-07 ***


# ================ Contextualised information content ================

# -------------- Document --------------
# loghdoc regressed on logpdoc, with a by-doc_id random intercept and a
# by-doc_id random slope for logpdoc.
# NOTE(review): the recorded output below lists 8589 observations, versus
# 8593 for the logh models -- presumably rows with a missing loghdoc are
# dropped before fitting; confirm against the data.
m <- lmer(
  loghdoc ~ 1 + logpdoc + (1 + logpdoc | doc_id),
  data = ptb
)
summary(m)
# Random effects:
 # Groups   Name        Variance  Std.Dev. Corr 
 # doc_id   (Intercept) 0.0824394 0.28712       
          # logpdoc     0.0009459 0.03076  -0.08
 # Residual             0.0944031 0.30725       
# Number of obs: 8589, groups:  doc_id, 400

# Fixed effects:
              # Estimate Std. Error         df t value Pr(>|t|)    
# (Intercept)  -0.013574   0.017042 398.777651  -0.797    0.426    
# logpdoc      -0.040543   0.004499 237.750046  -9.011   <2e-16 ***

# -------------- Paragraph --------------
# loghpar regressed on logppar, with a by-doc_id random intercept and a
# by-doc_id random slope for logppar.
m <- lmer(
  loghpar ~ 1 + logppar + (1 + logppar | doc_id),
  data = ptb
)
summary(m)
# Random effects:
 # Groups   Name        Variance Std.Dev. Corr
 # doc_id   (Intercept) 0.02387  0.1545       
          # logppar     0.02658  0.1630   0.73
 # Residual             0.05396  0.2323       
# Number of obs: 8589, groups:  doc_id, 400

# Fixed effects:
              # Estimate Std. Error         df t value Pr(>|t|)    
# (Intercept)  -0.007300   0.008804 374.765715  -0.829    0.408    
# logppar      -0.183184   0.010860 268.646911 -16.868   <2e-16 ***


# ==================== Mutual information ====================

# -------------- Document --------------
# xu_mi_document (untransformed) regressed on logpdoc, with a by-doc_id
# random intercept and a by-doc_id random slope for logpdoc.
# NOTE(review): the recorded output below reports a random-effect
# correlation of 1.00, which usually points to a singular fit -- consider
# checking isSingular(m) before interpreting the random-effects estimates.
m <- lmer(
  xu_mi_document ~ 1 + logpdoc + (1 + logpdoc | doc_id),
  data = ptb
)
summary(m)
# Random effects:
 # Groups   Name        Variance Std.Dev. Corr
 # doc_id   (Intercept) 0.016974 0.13028      
          # logpdoc     0.005882 0.07669  1.00
 # Residual             0.501212 0.70796      
# Number of obs: 8589, groups:  doc_id, 400

# Fixed effects:
             # Estimate Std. Error        df t value Pr(>|t|)    
# (Intercept) 4.767e-01  2.204e-02 2.239e+03   21.63   <2e-16 ***
# logpdoc     2.545e-01  1.051e-02 2.355e+02   24.22   <2e-16 ***

# -------------- Paragraph --------------
# m <- lmer(xu_mi_paragraph ~ 1 + logppar + (1 + logppar |doc_id), ptb)
# summary(m)
# Random effects:
 # Groups   Name        Variance Std.Dev. Corr 
 # doc_id   (Intercept)  0.07729 0.2780        
          # logppar      0.21557 0.4643   -0.93
 # Residual             17.94340 4.2360        
# Number of obs: 8589, groups:  doc_id, 400

# Fixed effects:
             # Estimate Std. Error        df t value Pr(>|t|)    
# (Intercept)   0.24293    0.06449 272.97655   3.767 0.000202 ***
# logppar       1.62800    0.09572 167.79081  17.008  < 2e-16 ***

