library(xlsx)
library(lme4)
library(lmtest)

# Load the first sheet of the combined results workbook.
d = read.xlsx('combined.xlsx', sheetIndex = 1)

# Exclude the 'zh-hk' rows from every analysis below.
d = d[d$dataset != 'zh-hk', ]

# The raw `dataset` ids are split on '_' : the first piece becomes `lang`,
# the second replaces `dataset` (e.g. 'gsd' / 'pud' as filtered on below).
# Split once up front (before `dataset` is overwritten) and extract with
# vapply() so both columns are always character vectors, even for zero rows.
id_parts = strsplit(as.character(d$dataset), '_', fixed = TRUE)
d$lang = vapply(id_parts, function(p) p[1], character(1))
d$dataset = vapply(id_parts, function(p) p[2], character(1))

# Quick visual check of the derived columns.
tail(d)

## Mixed-effects modeling

# Likelihood-ratio test on the 'gsd' rows: does adding `transfer` as a fixed
# effect improve on a baseline with the two supervised scores as fixed
# effects and a random intercept per language?
#
# The random-effects term must be parenthesised. `|` binds more loosely than
# `+`, so `a + b + 1|lang` is parsed as `(a + b + 1) | lang`: every predictor
# becomes a random slope and no fixed effect besides the intercept is fitted.
#
# REML = FALSE: the two models differ in their fixed effects, and lrtest()
# compares the stored log-likelihoods without refitting, so both models must
# be fitted by maximum likelihood for the test to be valid.
gsd_rows = na.omit(d[d$dataset == 'gsd', ])
fit1 = lmer(
    en_zero_shot_f ~ lang_super_gsd_f + en_ewt_super_f + (1 | lang),
    data = gsd_rows,
    REML = FALSE
)
fit2 = lmer(
    en_zero_shot_f ~ transfer + lang_super_gsd_f + en_ewt_super_f + (1 | lang),
    data = gsd_rows,
    REML = FALSE
)
lrtest(fit1, fit2)

# Likelihood-ratio test on the 'pud' rows: does adding `transfer` as a fixed
# effect improve on a baseline with the two supervised scores as fixed
# effects and a random intercept per language?
#
# The random-effects term must be parenthesised. `|` binds more loosely than
# `+`, so `a + b + 1|lang` is parsed as `(a + b + 1) | lang`: every predictor
# becomes a random slope and no fixed effect besides the intercept is fitted.
#
# REML = FALSE: the two models differ in their fixed effects, and lrtest()
# compares the stored log-likelihoods without refitting, so both models must
# be fitted by maximum likelihood for the test to be valid.
#
# NOTE(review): the covariate is `lang_super_gsd_f` even on the 'pud' subset;
# presumably intentional -- confirm.
pud_rows = na.omit(d[d$dataset == 'pud', ])
fit1 = lmer(
    en_zero_shot_f ~ lang_super_gsd_f + en_ewt_super_f + (1 | lang),
    data = pud_rows,
    REML = FALSE
)
fit2 = lmer(
    en_zero_shot_f ~ transfer + lang_super_gsd_f + en_ewt_super_f + (1 | lang),
    data = pud_rows,
    REML = FALSE
)
lrtest(fit1, fit2)

## Simple correlations

# For each language, print the Spearman rank correlation between
# `en_zero_shot_f` and `transfer` over the 'pud' rows, using only rows where
# both values are present.
for (lang in c('ru', 'fr', 'zh', 'ko', 'ja')) {
    scores = na.omit(
        d[(d$dataset == 'pud') & (d$lang == lang),
          c('en_zero_shot_f', 'transfer')])
    # Named `rho` rather than `c` so the result does not shadow base::c().
    rho = cor(scores$en_zero_shot_f, scores$transfer, method = 'spearman')
    cat(sprintf("%s: %f\n", lang, rho))
}
