First load the data

library(tictoc)
library(caret)

# Results table; the analyses below read the GLUE score columns and the
# "<probe_task>_layer_<k>" probing-score columns from it.
df <- read.csv(
  "../reports/probing_results_400_per_class/task1_predict_task_performance.csv"
)
dim(df)
[1] 25 98
# Names of the target columns (GLUE tasks) and the prefixes of the
# predictor columns (probing tasks).
all_glue_tasks <- c("rte", "cola", "mrpc", "sst2", "qnli", "qqp")
all_probe_tasks <- c(
  "bigram_shift", "coordination_inversion", "obj_number", "odd_man_out",
  "past_present", "subj_number", "tree_depth"
)

1. Probing from all layers in one task

# Predict one GLUE score from all 12 layer scores of a single probing task
# and compare the fit against a control model trained on pure noise.
#
# Args:
#   glue_task:  name of the column of the global `df` to predict.
#   probe_task: probing-task prefix; the predictors are the columns
#               "<probe_task>_layer_1" ... "<probe_task>_layer_12" of `df`.
#
# Returns a list with
#   RMSE            root-mean-square residual of the probing model,
#   ctrl_RMSE       the same quantity for the noise-feature control model,
#   RMSE_reduction  percentage by which RMSE is lower than ctrl_RMSE,
#   explained_var   percentage of the target's total sum of squares that
#                   the probing model accounts for.
all_layers_from_one_task <- function(glue_task, probe_task) {
  features <- paste0(probe_task, "_layer_", 1:12)
  model_formula <- as.formula(paste(glue_task, "~ ."))
  trcontrol <- trainControl(method = "cv", number = 5)

  model <- train(model_formula, data = df[c(glue_task, features)],
                 method = "lm", trControl = trcontrol)
  # NOTE(review): summary() on a caret train object describes the final
  # model refit on every row, so this is the in-sample residual RMSE; the
  # 5-fold estimate lives in model$results -- confirm which one is intended.
  rmse <- sqrt(mean(summary(model)$residuals^2))

  # Control: the same number of predictors, drawn from N(0, 0.1^2) noise.
  ctrl_features <- matrix(rnorm(length(features) * nrow(df), 0, 0.1),
                          nrow = nrow(df), ncol = length(features))
  Z <- as.data.frame(cbind(df[glue_task], ctrl_features))
  ctrl_model <- train(model_formula, data = Z, method = "lm",
                      trControl = trcontrol)
  ctrl_rmse <- sqrt(mean(summary(ctrl_model)$residuals^2))

  # The total sum of squares needs the number of observations. length() of
  # a data frame is its column count, so work on the target vector instead.
  y <- df[[glue_task]]
  SST <- var(y) * (length(y) - 1)
  # deviance() has no method for caret train objects; take the residual
  # sum of squares from the underlying lm fit.
  SSE <- deviance(model$finalModel)

  list("RMSE" = rmse,
       "ctrl_RMSE" = ctrl_rmse,
       "RMSE_reduction" = (ctrl_rmse - rmse) / ctrl_rmse * 100,
       "explained_var" = (SST - SSE) / SST * 100)
}

# Fix the RNG state so the random draws made while fitting are repeatable,
# then report every (GLUE task, probing task) pair.
set.seed(1234)
for (gt in all_glue_tasks) {
  print(sprintf("Predict %s", gt))
  for (pt in all_probe_tasks) {
    ret <- all_layers_from_one_task(gt, pt)
    print(sprintf(
      "probing task %s. RMSE %.4f. ctrl_RMSE %.4f RMSE_reduction %.2f",
      pt, ret$RMSE, ret$ctrl_RMSE, ret$RMSE_reduction
    ))
  }
}
[1] "Predict rte"
[1] "probing task bigram_shift. RMSE 0.0298. ctrl_RMSE 0.0377 RMSE_reduction 21.00"
[1] "probing task coordination_inversion. RMSE 0.0328. ctrl_RMSE 0.0343 RMSE_reduction 4.49"
[1] "probing task obj_number. RMSE 0.0324. ctrl_RMSE 0.0460 RMSE_reduction 29.51"
[1] "probing task odd_man_out. RMSE 0.0416. ctrl_RMSE 0.0396 RMSE_reduction -5.09"
[1] "probing task past_present. RMSE 0.0438. ctrl_RMSE 0.0444 RMSE_reduction 1.30"
[1] "probing task subj_number. RMSE 0.0368. ctrl_RMSE 0.0408 RMSE_reduction 9.89"
[1] "probing task tree_depth. RMSE 0.0441. ctrl_RMSE 0.0395 RMSE_reduction -11.49"
[1] "Predict cola"
[1] "probing task bigram_shift. RMSE 0.0090. ctrl_RMSE 0.0194 RMSE_reduction 53.66"
[1] "probing task coordination_inversion. RMSE 0.0111. ctrl_RMSE 0.0162 RMSE_reduction 31.28"
[1] "probing task obj_number. RMSE 0.0055. ctrl_RMSE 0.0125 RMSE_reduction 56.10"
[1] "probing task odd_man_out. RMSE 0.0144. ctrl_RMSE 0.0134 RMSE_reduction -7.14"
[1] "probing task past_present. RMSE 0.0086. ctrl_RMSE 0.0178 RMSE_reduction 51.79"
[1] "probing task subj_number. RMSE 0.0038. ctrl_RMSE 0.0162 RMSE_reduction 76.32"
[1] "probing task tree_depth. RMSE 0.0053. ctrl_RMSE 0.0155 RMSE_reduction 66.06"
[1] "Predict mrpc"
[1] "probing task bigram_shift. RMSE 0.0199. ctrl_RMSE 0.0247 RMSE_reduction 19.65"
[1] "probing task coordination_inversion. RMSE 0.0176. ctrl_RMSE 0.0228 RMSE_reduction 22.83"
[1] "probing task obj_number. RMSE 0.0145. ctrl_RMSE 0.0241 RMSE_reduction 39.94"
[1] "probing task odd_man_out. RMSE 0.0206. ctrl_RMSE 0.0246 RMSE_reduction 16.09"
[1] "probing task past_present. RMSE 0.0178. ctrl_RMSE 0.0269 RMSE_reduction 33.63"
[1] "probing task subj_number. RMSE 0.0142. ctrl_RMSE 0.0269 RMSE_reduction 47.19"
[1] "probing task tree_depth. RMSE 0.0162. ctrl_RMSE 0.0229 RMSE_reduction 28.98"
[1] "Predict sst2"
[1] "probing task bigram_shift. RMSE 0.0046. ctrl_RMSE 0.0072 RMSE_reduction 35.60"
[1] "probing task coordination_inversion. RMSE 0.0049. ctrl_RMSE 0.0080 RMSE_reduction 38.93"
[1] "probing task obj_number. RMSE 0.0026. ctrl_RMSE 0.0077 RMSE_reduction 65.95"
[1] "probing task odd_man_out. RMSE 0.0070. ctrl_RMSE 0.0077 RMSE_reduction 9.36"
[1] "probing task past_present. RMSE 0.0061. ctrl_RMSE 0.0071 RMSE_reduction 13.52"
[1] "probing task subj_number. RMSE 0.0043. ctrl_RMSE 0.0084 RMSE_reduction 48.62"
[1] "probing task tree_depth. RMSE 0.0054. ctrl_RMSE 0.0074 RMSE_reduction 27.13"
[1] "Predict qnli"
[1] "probing task bigram_shift. RMSE 0.0043. ctrl_RMSE 0.0089 RMSE_reduction 51.32"
[1] "probing task coordination_inversion. RMSE 0.0051. ctrl_RMSE 0.0054 RMSE_reduction 6.30"
[1] "probing task obj_number. RMSE 0.0024. ctrl_RMSE 0.0086 RMSE_reduction 72.30"
[1] "probing task odd_man_out. RMSE 0.0066. ctrl_RMSE 0.0066 RMSE_reduction -0.71"
[1] "probing task past_present. RMSE 0.0032. ctrl_RMSE 0.0064 RMSE_reduction 49.43"
[1] "probing task subj_number. RMSE 0.0026. ctrl_RMSE 0.0074 RMSE_reduction 65.36"
[1] "probing task tree_depth. RMSE 0.0033. ctrl_RMSE 0.0083 RMSE_reduction 59.93"
[1] "Predict qqp"
[1] "probing task bigram_shift. RMSE 0.0199. ctrl_RMSE 0.0469 RMSE_reduction 57.55"
[1] "probing task coordination_inversion. RMSE 0.0374. ctrl_RMSE 0.0502 RMSE_reduction 25.51"
[1] "probing task obj_number. RMSE 0.0150. ctrl_RMSE 0.0509 RMSE_reduction 70.54"
[1] "probing task odd_man_out. RMSE 0.0236. ctrl_RMSE 0.0441 RMSE_reduction 46.36"
[1] "probing task past_present. RMSE 0.0110. ctrl_RMSE 0.0409 RMSE_reduction 73.01"
[1] "probing task subj_number. RMSE 0.0187. ctrl_RMSE 0.0371 RMSE_reduction 49.42"
[1] "probing task tree_depth. RMSE 0.0256. ctrl_RMSE 0.0450 RMSE_reduction 43.10"

2. Which features are significant?

# Regress one GLUE score on all 12 layer scores of a single probing task
# and report which layers are significant in the sequential ANOVA.
#
# Args:
#   glue_task:  name of the column of the global `df` to predict.
#   probe_task: probing-task prefix; the predictors are the columns
#               "<probe_task>_layer_1" ... "<probe_task>_layer_12" of `df`.
#
# Returns a list with
#   anova_result    the anova() table of the fitted lm,
#   sig_features    predictor names whose ANOVA p-value is below 0.05,
#   RMSE            root-mean-square residual of the fitted lm,
#   RMSE_reduction  percentage by which RMSE is lower than the RMSE of a
#                   control lm fitted on Gaussian-noise predictors,
#   explained_var   percentage of the target's total sum of squares that
#                   the model accounts for.
probing_from_one_task <- function(glue_task, probe_task) {
  features <- paste0(probe_task, "_layer_", 1:12)
  model <- lm(as.formula(paste(glue_task, "~ .")),
              data = df[c(glue_task, features)])
  anova_result <- anova(model)
  rmse <- sqrt(mean(summary(model)$residuals^2))

  # The ANOVA table ends with a "Residuals" row whose p-value is NA.
  # which() drops that NA, and matching by row name keeps the result
  # aligned with `features` even if a term is missing from the table.
  p_values <- anova_result[["Pr(>F)"]]
  sig_terms <- trimws(rownames(anova_result))[which(p_values < 0.05)]
  sig_features <- features[features %in% sig_terms]

  # Control: the same number of predictors, drawn from N(0, 0.1^2) noise.
  ctrl_features <- matrix(rnorm(length(features) * nrow(df), 0, 0.1),
                          nrow = nrow(df), ncol = length(features))
  Z <- as.data.frame(cbind(df[glue_task], ctrl_features))
  ctrl_model <- lm(as.formula(sprintf("%s ~ .", glue_task)), data = Z)
  ctrl_rmse <- sqrt(mean(summary(ctrl_model)$residuals^2))

  # The total sum of squares needs the number of observations. length() of
  # a data frame is its column count, so work on the target vector instead.
  y <- df[[glue_task]]
  SST <- var(y) * (length(y) - 1)
  SSE <- deviance(model)

  list("anova_result" = anova_result,
       "sig_features" = sig_features,
       "RMSE" = rmse,
       "RMSE_reduction" = (ctrl_rmse - rmse) / ctrl_rmse * 100,
       "explained_var" = (SST - SSE) / SST * 100)
}

# Fix the RNG state so the noise controls are repeatable, then list the
# significant layers for every (GLUE task, probing task) pair.
set.seed(1234)
for (gt in all_glue_tasks) {
  print(sprintf("Predict %s", gt))
  for (pt in all_probe_tasks) {
    ret <- probing_from_one_task(gt, pt)
    sig_features <- ret$sig_features
    print(sprintf("probing task %s", pt))
    # Print the names directly: they are data, not sprintf() format strings.
    print(sig_features)
  }
}
[1] "Predict rte"
[1] "probing task bigram_shift"
[1] "bigram_shift_layer_4" "bigram_shift_layer_9" "NA"                  
[1] "probing task coordination_inversion"
[1] "coordination_inversion_layer_5" "NA"                            
[1] "probing task obj_number"
[1] "obj_number_layer_1" "obj_number_layer_3" "obj_number_layer_5"
[4] "NA"                
[1] "probing task odd_man_out"
[1] "odd_man_out_layer_6" "NA"                 
[1] "probing task past_present"
[1] "past_present_layer_1" "NA"                  
[1] "probing task subj_number"
[1] "subj_number_layer_3" "NA"                 
[1] "probing task tree_depth"
[1] "NA"
[1] "Predict cola"
[1] "probing task bigram_shift"
[1] "bigram_shift_layer_2"  "bigram_shift_layer_4"  "bigram_shift_layer_5" 
[4] "bigram_shift_layer_12" "NA"                   
[1] "probing task coordination_inversion"
[1] "coordination_inversion_layer_1"  "coordination_inversion_layer_7" 
[3] "coordination_inversion_layer_11" "NA"                             
[1] "probing task obj_number"
[1] "obj_number_layer_1"  "obj_number_layer_2"  "obj_number_layer_3" 
[4] "obj_number_layer_8"  "obj_number_layer_9"  "obj_number_layer_11"
[7] "obj_number_layer_12" "NA"                 
[1] "probing task odd_man_out"
[1] "odd_man_out_layer_5"  "odd_man_out_layer_12" "NA"                  
[1] "probing task past_present"
[1] "past_present_layer_1" "past_present_layer_4" "past_present_layer_5"
[4] "past_present_layer_8" "past_present_layer_9" "NA"                  
[1] "probing task subj_number"
[1] "subj_number_layer_1"  "subj_number_layer_4"  "subj_number_layer_6" 
[4] "subj_number_layer_10" "subj_number_layer_11" "NA"                  
[1] "probing task tree_depth"
[1] "tree_depth_layer_1"  "tree_depth_layer_2"  "tree_depth_layer_4" 
[4] "tree_depth_layer_6"  "tree_depth_layer_8"  "tree_depth_layer_12"
[7] "NA"                 
[1] "Predict mrpc"
[1] "probing task bigram_shift"
[1] "bigram_shift_layer_4" "NA"                  
[1] "probing task coordination_inversion"
[1] "coordination_inversion_layer_5" "coordination_inversion_layer_7"
[3] "NA"                            
[1] "probing task obj_number"
[1] "obj_number_layer_1" "obj_number_layer_3" "obj_number_layer_4"
[4] "NA"                
[1] "probing task odd_man_out"
[1] "odd_man_out_layer_6" "NA"                 
[1] "probing task past_present"
[1] "past_present_layer_1" "past_present_layer_7" "past_present_layer_8"
[4] "NA"                  
[1] "probing task subj_number"
[1] "subj_number_layer_1" "subj_number_layer_3" "subj_number_layer_4"
[4] "NA"                 
[1] "probing task tree_depth"
[1] "tree_depth_layer_1" "tree_depth_layer_7" "tree_depth_layer_8"
[4] "NA"                
[1] "Predict sst2"
[1] "probing task bigram_shift"
[1] "bigram_shift_layer_2"  "bigram_shift_layer_4"  "bigram_shift_layer_5" 
[4] "bigram_shift_layer_12" "NA"                   
[1] "probing task coordination_inversion"
[1] "coordination_inversion_layer_1"  "coordination_inversion_layer_7" 
[3] "coordination_inversion_layer_8"  "coordination_inversion_layer_11"
[5] "NA"                             
[1] "probing task obj_number"
[1] "obj_number_layer_1"  "obj_number_layer_3"  "obj_number_layer_4" 
[4] "obj_number_layer_5"  "obj_number_layer_8"  "obj_number_layer_9" 
[7] "obj_number_layer_11" "obj_number_layer_12" "NA"                 
[1] "probing task odd_man_out"
[1] "odd_man_out_layer_12" "NA"                  
[1] "probing task past_present"
[1] "past_present_layer_1" "past_present_layer_4" "past_present_layer_8"
[4] "NA"                  
[1] "probing task subj_number"
[1] "subj_number_layer_1" "subj_number_layer_4" "NA"                 
[1] "probing task tree_depth"
[1] "tree_depth_layer_1" "tree_depth_layer_6" "NA"                
[1] "Predict qnli"
[1] "probing task bigram_shift"
[1] "bigram_shift_layer_2" "bigram_shift_layer_4" "bigram_shift_layer_5"
[4] "NA"                  
[1] "probing task coordination_inversion"
[1] "coordination_inversion_layer_1"  "coordination_inversion_layer_11"
[3] "NA"                             
[1] "probing task obj_number"
[1] "obj_number_layer_1"  "obj_number_layer_3"  "obj_number_layer_5" 
[4] "obj_number_layer_8"  "obj_number_layer_9"  "obj_number_layer_11"
[7] "obj_number_layer_12" "NA"                 
[1] "probing task odd_man_out"
[1] "NA"
[1] "probing task past_present"
[1] "past_present_layer_1" "past_present_layer_4" "past_present_layer_7"
[4] "past_present_layer_8" "past_present_layer_9" "NA"                  
[1] "probing task subj_number"
[1] "subj_number_layer_1" "NA"                 
[1] "probing task tree_depth"
[1] "tree_depth_layer_1" "tree_depth_layer_2" "NA"                
[1] "Predict qqp"
[1] "probing task bigram_shift"
[1] "bigram_shift_layer_2" "bigram_shift_layer_4" "bigram_shift_layer_5"
[4] "bigram_shift_layer_8" "NA"                  
[1] "probing task coordination_inversion"
[1] "coordination_inversion_layer_8"  "coordination_inversion_layer_11"
[3] "NA"                             
[1] "probing task obj_number"
[1] "obj_number_layer_2"  "obj_number_layer_3"  "obj_number_layer_5" 
[4] "obj_number_layer_6"  "obj_number_layer_12" "NA"                 
[1] "probing task odd_man_out"
[1] "odd_man_out_layer_1"  "odd_man_out_layer_5"  "odd_man_out_layer_6" 
[4] "odd_man_out_layer_8"  "odd_man_out_layer_10" "NA"                  
[1] "probing task past_present"
[1] "past_present_layer_1"  "past_present_layer_2"  "past_present_layer_3" 
[4] "past_present_layer_7"  "past_present_layer_8"  "past_present_layer_10"
[7] "past_present_layer_11" "NA"                   
[1] "probing task subj_number"
[1] "subj_number_layer_1" "subj_number_layer_2" "subj_number_layer_3"
[4] "subj_number_layer_4" "subj_number_layer_5" "subj_number_layer_9"
[7] "NA"                 
[1] "probing task tree_depth"
[1] "tree_depth_layer_2" "tree_depth_layer_3" "tree_depth_layer_7"
[4] "NA"                

3. Probing from some layers from some tasks

Just use one layer for each probing task.


# Predict one GLUE score from an arbitrary set of probing features with a
# caret-trained linear model, and compare against a noise-feature control.
#
# Args:
#   glue_task: name of the column of the global `df` to predict.
#   features:  character vector of predictor column names in `df`.
#
# Returns a list with the model summary (`summary_result`), its residual
# RMSE (`RMSE`) and the percentage by which that RMSE is below the
# control model's RMSE (`RMSE_reduction`; 0 when the control RMSE is 0).
probing_some_layers_some_ptasks <- function(glue_task, features) {
  # caret needs a real formula object here; a plain string raises an error.
  target_formula <- as.formula(paste(glue_task, "~ ."))

  fit <- train(target_formula,
               data = df[c(glue_task, features)],
               trControl = trainControl(method = "cv", number = 5),
               method = "lm")
  fit_summary <- summary(fit)
  fit_rmse <- sqrt(mean(fit_summary$residuals^2))

  # Control predictors: one N(0, 0.1^2) noise column per real feature.
  n_rows <- nrow(df)
  n_cols <- length(features)
  ctrl_features <- matrix(rnorm(n_cols * n_rows, 0, 0.1),
                          nrow = n_rows, ncol = n_cols)
  Z <- as.data.frame(cbind(df[glue_task], ctrl_features))
  noise_fit <- train(as.formula(sprintf("%s ~ .", glue_task)),
                     data = Z,
                     method = "lm",
                     trControl = trainControl(method = "cv", number = 5))
  noise_rmse <- sqrt(mean(summary(noise_fit)$residuals^2))

  # Guard the percentage against a zero denominator.
  reduction <- if (noise_rmse == 0) {
    0
  } else {
    (noise_rmse - fit_rmse) / noise_rmse * 100
  }

  list("summary_result" = fit_summary,
       "RMSE" = fit_rmse,
       "RMSE_reduction" = reduction)
}

# One hand-picked layer per probing task; the same set is used for every
# GLUE task, so it is built once outside the loop.
features <- c(
  "bigram_shift_layer_4",
  "coordination_inversion_layer_11",
  "obj_number_layer_3",
  "odd_man_out_layer_6",
  "past_present_layer_1",
  "subj_number_layer_5",
  "tree_depth_layer_1"
)

# Seed here, as sections 1 and 2 do, so this chunk gives the same numbers
# when it is run on its own.
set.seed(1234)
for (gt in all_glue_tasks) {
  ret <- probing_some_layers_some_ptasks(gt, features)
  print(sprintf("GLUE task %s, RMSE %.5f, RMSE_reduction %.2f",
                gt, ret$RMSE, ret$RMSE_reduction))
}
[1] "GLUE task rte, RMSE 0.03586, RMSE_reduction 32.44"
[1] "GLUE task cola, RMSE 0.00679, RMSE_reduction 64.96"
[1] "GLUE task mrpc, RMSE 0.01892, RMSE_reduction 35.77"
[1] "GLUE task sst2, RMSE 0.00556, RMSE_reduction 39.00"
[1] "GLUE task qnli, RMSE 0.00357, RMSE_reduction 58.55"
[1] "GLUE task qqp, RMSE 0.02176, RMSE_reduction 60.91"

4. Predict from just 3 features

Feature elimination:
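- Try a brute-force search over all feature triples: this takes \(84*83*82\) runs if ordered triples are enumerated (\(\binom{84}{3} = 95{,}284\) runs if each unordered triple is tried once). Without 5-fold CV this takes around 10 mins per GLUE task. With CV it doesn't finish within 2 hrs, which is too long. Optimize a bit: just use lm to select the features. When reporting, report the CV results.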
- Use the RFE by caret? The RMSE values are not as good as those from 12 features one ptask.

# Plain-lm variant of the probing model: predict one GLUE score from the
# given probing features and compare against a noise-feature control.
#
# Args:
#   glue_task: name of the column of the global `df` to predict.
#   features:  character vector of predictor column names in `df`.
#
# Returns a list with the lm summary (`summary_result`), its residual RMSE
# (`RMSE`) and the percentage by which that RMSE is below the control
# model's RMSE (`RMSE_reduction`; 0 when the control RMSE is 0).
probing_some_layers_some_ptasks_fast <- function(glue_task, features) {
  x_y_features <- c(glue_task, features)
  formula <- as.formula(paste(glue_task, "~ ."))
  model_summary <- summary(lm(formula, data = df[x_y_features]))
  probe_rmse <- sqrt(mean(model_summary$residuals^2))

  # Control predictors: one N(0, 0.1^2) noise column per real feature.
  n_rows <- nrow(df)
  n_cols <- length(features)
  ctrl_features <- matrix(rnorm(n_cols * n_rows, 0, 0.1),
                          nrow = n_rows, ncol = n_cols)
  Z <- as.data.frame(cbind(df[glue_task], ctrl_features))
  noise_fit <- lm(sprintf("%s ~ .", glue_task), data = Z)
  noise_rmse <- sqrt(mean(summary(noise_fit)$residuals^2))

  # Guard the percentage against a zero denominator.
  reduction <- if (noise_rmse == 0) {
    0
  } else {
    (noise_rmse - probe_rmse) / noise_rmse * 100
  }

  list("summary_result" = model_summary,
       "RMSE" = probe_rmse,
       "RMSE_reduction" = reduction)
}

# Character matrix (probing task x layer) holding every
# "<probe_task>_layer_<k>" column name.
all_probe_features <- outer(all_probe_tasks, paste0("_layer_", 1:12),
                            FUN = "paste0")

# Exhaustively search all 3-feature subsets of `all_probe_features` for the
# one whose plain-lm fit of `glue_task` has the smallest residual RMSE,
# then re-evaluate that subset with the caret-based probing function.
#
# Returns a list with the RMSE reduction of the winning subset
# (`max_rmse_reduction`) and the three feature names (`best_features`).
find_best_features <- function(glue_task) {
  best_features <- NA
  smallest_rmse <- 10000

  # combn() yields index triples i < j < k in lexicographic order, i.e. the
  # order three nested index loops would visit them.
  triples <- combn(length(all_probe_features), 3)
  for (col in seq_len(ncol(triples))) {
    feats <- all_probe_features[triples[, col]]
    candidate <- probing_some_layers_some_ptasks_fast(glue_task, feats)
    if (candidate$RMSE < smallest_rmse) {
      smallest_rmse <- candidate$RMSE
      best_features <- feats
    }
  }

  final <- probing_some_layers_some_ptasks(glue_task, best_features)
  list("max_rmse_reduction" = final$RMSE_reduction,
       "best_features" = best_features)
}

# Seed here, as sections 1 and 2 do, so this chunk gives the same numbers
# when it is run on its own.
set.seed(1234)
for (gt in all_glue_tasks) {
  tic("find_best_features")
  retval <- find_best_features(gt)
  toc()
  # Collapse the feature names first: sprintf() recycles over vectors and
  # would otherwise emit one near-identical line per feature.
  print(sprintf("Glue task %s, max rmse reduction %.2f, achieved using %s",
                gt, retval$max_rmse_reduction,
                paste(retval$best_features, collapse = ", ")))
}
find_best_features: 125.388 sec elapsed
[1] "Glue task rte, max rmse reduction 47.38, achieved using past_present_layer_1"           
[2] "Glue task rte, max rmse reduction 47.38, achieved using subj_number_layer_11"           
[3] "Glue task rte, max rmse reduction 47.38, achieved using coordination_inversion_layer_12"
find_best_features: 124.373 sec elapsed
[1] "Glue task cola, max rmse reduction 77.84, achieved using subj_number_layer_1" 
[2] "Glue task cola, max rmse reduction 77.84, achieved using bigram_shift_layer_6"
[3] "Glue task cola, max rmse reduction 77.84, achieved using tree_depth_layer_8"  
find_best_features: 124.03 sec elapsed
[1] "Glue task mrpc, max rmse reduction 56.70, achieved using bigram_shift_layer_2"
[2] "Glue task mrpc, max rmse reduction 56.70, achieved using obj_number_layer_7"  
[3] "Glue task mrpc, max rmse reduction 56.70, achieved using odd_man_out_layer_9" 
find_best_features: 126.714 sec elapsed
[1] "Glue task sst2, max rmse reduction 72.27, achieved using subj_number_layer_1"           
[2] "Glue task sst2, max rmse reduction 72.27, achieved using past_present_layer_2"          
[3] "Glue task sst2, max rmse reduction 72.27, achieved using coordination_inversion_layer_6"
find_best_features: 126.651 sec elapsed
[1] "Glue task qnli, max rmse reduction 82.01, achieved using subj_number_layer_1" 
[2] "Glue task qnli, max rmse reduction 82.01, achieved using subj_number_layer_8" 
[3] "Glue task qnli, max rmse reduction 82.01, achieved using bigram_shift_layer_9"
find_best_features: 124.525 sec elapsed
[1] "Glue task qqp, max rmse reduction 71.08, achieved using past_present_layer_3"
[2] "Glue task qqp, max rmse reduction 71.08, achieved using bigram_shift_layer_4"
[3] "Glue task qqp, max rmse reduction 71.08, achieved using bigram_shift_layer_8"
---
title: "Predict GLUE performance"
output: html_notebook
---

First load the data
```{r}
library(tictoc)
library(caret)

# Results table; the analyses below read the GLUE score columns and the
# "<probe_task>_layer_<k>" probing-score columns from it.
df <- read.csv(
  "../reports/probing_results_400_per_class/task1_predict_task_performance.csv"
)
dim(df)

# Names of the target columns (GLUE tasks) and the prefixes of the
# predictor columns (probing tasks).
all_glue_tasks <- c("rte", "cola", "mrpc", "sst2", "qnli", "qqp")
all_probe_tasks <- c(
  "bigram_shift", "coordination_inversion", "obj_number", "odd_man_out",
  "past_present", "subj_number", "tree_depth"
)
```

## 1. Probing from all layers in one task

```{r}
# Predict one GLUE score from all 12 layer scores of a single probing task
# and compare the fit against a control model trained on pure noise.
#
# Args:
#   glue_task:  name of the column of the global `df` to predict.
#   probe_task: probing-task prefix; the predictors are the columns
#               "<probe_task>_layer_1" ... "<probe_task>_layer_12" of `df`.
#
# Returns a list with
#   RMSE            root-mean-square residual of the probing model,
#   ctrl_RMSE       the same quantity for the noise-feature control model,
#   RMSE_reduction  percentage by which RMSE is lower than ctrl_RMSE,
#   explained_var   percentage of the target's total sum of squares that
#                   the probing model accounts for.
all_layers_from_one_task <- function(glue_task, probe_task) {
  features <- paste0(probe_task, "_layer_", 1:12)
  model_formula <- as.formula(paste(glue_task, "~ ."))
  trcontrol <- trainControl(method = "cv", number = 5)

  model <- train(model_formula, data = df[c(glue_task, features)],
                 method = "lm", trControl = trcontrol)
  # NOTE(review): summary() on a caret train object describes the final
  # model refit on every row, so this is the in-sample residual RMSE; the
  # 5-fold estimate lives in model$results -- confirm which one is intended.
  rmse <- sqrt(mean(summary(model)$residuals^2))

  # Control: the same number of predictors, drawn from N(0, 0.1^2) noise.
  ctrl_features <- matrix(rnorm(length(features) * nrow(df), 0, 0.1),
                          nrow = nrow(df), ncol = length(features))
  Z <- as.data.frame(cbind(df[glue_task], ctrl_features))
  ctrl_model <- train(model_formula, data = Z, method = "lm",
                      trControl = trcontrol)
  ctrl_rmse <- sqrt(mean(summary(ctrl_model)$residuals^2))

  # The total sum of squares needs the number of observations. length() of
  # a data frame is its column count, so work on the target vector instead.
  y <- df[[glue_task]]
  SST <- var(y) * (length(y) - 1)
  # deviance() has no method for caret train objects; take the residual
  # sum of squares from the underlying lm fit.
  SSE <- deviance(model$finalModel)

  list("RMSE" = rmse,
       "ctrl_RMSE" = ctrl_rmse,
       "RMSE_reduction" = (ctrl_rmse - rmse) / ctrl_rmse * 100,
       "explained_var" = (SST - SSE) / SST * 100)
}

# Fix the RNG state so the random draws made while fitting are repeatable,
# then report every (GLUE task, probing task) pair.
set.seed(1234)
for (gt in all_glue_tasks) {
  print(sprintf("Predict %s", gt))
  for (pt in all_probe_tasks) {
    ret <- all_layers_from_one_task(gt, pt)
    print(sprintf(
      "probing task %s. RMSE %.4f. ctrl_RMSE %.4f RMSE_reduction %.2f",
      pt, ret$RMSE, ret$ctrl_RMSE, ret$RMSE_reduction
    ))
  }
}
```

## 2. Which features are significant?

```{r}
# Regress one GLUE score on all 12 layer scores of a single probing task
# and report which layers are significant in the sequential ANOVA.
#
# Args:
#   glue_task:  name of the column of the global `df` to predict.
#   probe_task: probing-task prefix; the predictors are the columns
#               "<probe_task>_layer_1" ... "<probe_task>_layer_12" of `df`.
#
# Returns a list with
#   anova_result    the anova() table of the fitted lm,
#   sig_features    predictor names whose ANOVA p-value is below 0.05,
#   RMSE            root-mean-square residual of the fitted lm,
#   RMSE_reduction  percentage by which RMSE is lower than the RMSE of a
#                   control lm fitted on Gaussian-noise predictors,
#   explained_var   percentage of the target's total sum of squares that
#                   the model accounts for.
probing_from_one_task <- function(glue_task, probe_task) {
  features <- paste0(probe_task, "_layer_", 1:12)
  model <- lm(as.formula(paste(glue_task, "~ .")),
              data = df[c(glue_task, features)])
  anova_result <- anova(model)
  rmse <- sqrt(mean(summary(model)$residuals^2))

  # The ANOVA table ends with a "Residuals" row whose p-value is NA.
  # which() drops that NA, and matching by row name keeps the result
  # aligned with `features` even if a term is missing from the table.
  p_values <- anova_result[["Pr(>F)"]]
  sig_terms <- trimws(rownames(anova_result))[which(p_values < 0.05)]
  sig_features <- features[features %in% sig_terms]

  # Control: the same number of predictors, drawn from N(0, 0.1^2) noise.
  ctrl_features <- matrix(rnorm(length(features) * nrow(df), 0, 0.1),
                          nrow = nrow(df), ncol = length(features))
  Z <- as.data.frame(cbind(df[glue_task], ctrl_features))
  ctrl_model <- lm(as.formula(sprintf("%s ~ .", glue_task)), data = Z)
  ctrl_rmse <- sqrt(mean(summary(ctrl_model)$residuals^2))

  # The total sum of squares needs the number of observations. length() of
  # a data frame is its column count, so work on the target vector instead.
  y <- df[[glue_task]]
  SST <- var(y) * (length(y) - 1)
  SSE <- deviance(model)

  list("anova_result" = anova_result,
       "sig_features" = sig_features,
       "RMSE" = rmse,
       "RMSE_reduction" = (ctrl_rmse - rmse) / ctrl_rmse * 100,
       "explained_var" = (SST - SSE) / SST * 100)
}

# Fix the RNG state so the noise controls are repeatable, then list the
# significant layers for every (GLUE task, probing task) pair.
set.seed(1234)
for (gt in all_glue_tasks) {
  print(sprintf("Predict %s", gt))
  for (pt in all_probe_tasks) {
    ret <- probing_from_one_task(gt, pt)
    sig_features <- ret$sig_features
    print(sprintf("probing task %s", pt))
    # Print the names directly: they are data, not sprintf() format strings.
    print(sig_features)
  }
}
```

## 3. Probing from some layers from some tasks
Just use one layer for each probing task.

```{r}

# Predict one GLUE score from an arbitrary set of probing features with a
# caret-trained linear model, and compare against a noise-feature control.
#
# Args:
#   glue_task: name of the column of the global `df` to predict.
#   features:  character vector of predictor column names in `df`.
#
# Returns a list with the model summary (`summary_result`), its residual
# RMSE (`RMSE`) and the percentage by which that RMSE is below the
# control model's RMSE (`RMSE_reduction`; 0 when the control RMSE is 0).
probing_some_layers_some_ptasks <- function(glue_task, features) {
  # caret needs a real formula object here; a plain string raises an error.
  target_formula <- as.formula(paste(glue_task, "~ ."))

  fit <- train(target_formula,
               data = df[c(glue_task, features)],
               trControl = trainControl(method = "cv", number = 5),
               method = "lm")
  fit_summary <- summary(fit)
  fit_rmse <- sqrt(mean(fit_summary$residuals^2))

  # Control predictors: one N(0, 0.1^2) noise column per real feature.
  n_rows <- nrow(df)
  n_cols <- length(features)
  ctrl_features <- matrix(rnorm(n_cols * n_rows, 0, 0.1),
                          nrow = n_rows, ncol = n_cols)
  Z <- as.data.frame(cbind(df[glue_task], ctrl_features))
  noise_fit <- train(as.formula(sprintf("%s ~ .", glue_task)),
                     data = Z,
                     method = "lm",
                     trControl = trainControl(method = "cv", number = 5))
  noise_rmse <- sqrt(mean(summary(noise_fit)$residuals^2))

  # Guard the percentage against a zero denominator.
  reduction <- if (noise_rmse == 0) {
    0
  } else {
    (noise_rmse - fit_rmse) / noise_rmse * 100
  }

  list("summary_result" = fit_summary,
       "RMSE" = fit_rmse,
       "RMSE_reduction" = reduction)
}

# One hand-picked layer per probing task; the same set is used for every
# GLUE task, so it is built once outside the loop.
features <- c(
  "bigram_shift_layer_4",
  "coordination_inversion_layer_11",
  "obj_number_layer_3",
  "odd_man_out_layer_6",
  "past_present_layer_1",
  "subj_number_layer_5",
  "tree_depth_layer_1"
)

# Seed here, as sections 1 and 2 do, so this chunk gives the same numbers
# when it is run on its own.
set.seed(1234)
for (gt in all_glue_tasks) {
  ret <- probing_some_layers_some_ptasks(gt, features)
  print(sprintf("GLUE task %s, RMSE %.5f, RMSE_reduction %.2f",
                gt, ret$RMSE, ret$RMSE_reduction))
}
```

## 4. Predict from just 3 features

Feature elimination:   
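- Try a brute-force search over all feature triples: this takes $84*83*82$ runs if ordered triples are enumerated ($\binom{84}{3} = 95{,}284$ runs if each unordered triple is tried once). Without 5-fold CV this takes around 10 mins per GLUE task. With CV it doesn't finish within 2 hrs, which is too long. Optimize a bit: just use lm to select the features. When reporting, report the CV results.  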
- Use the RFE by `caret`? The RMSE values are not as good as those from 12 features one ptask.   

```{r}
# Plain-lm variant of the probing model: predict one GLUE score from the
# given probing features and compare against a noise-feature control.
#
# Args:
#   glue_task: name of the column of the global `df` to predict.
#   features:  character vector of predictor column names in `df`.
#
# Returns a list with the lm summary (`summary_result`), its residual RMSE
# (`RMSE`) and the percentage by which that RMSE is below the control
# model's RMSE (`RMSE_reduction`; 0 when the control RMSE is 0).
probing_some_layers_some_ptasks_fast <- function(glue_task, features) {
  x_y_features <- c(glue_task, features)
  formula <- as.formula(paste(glue_task, "~ ."))
  model_summary <- summary(lm(formula, data = df[x_y_features]))
  probe_rmse <- sqrt(mean(model_summary$residuals^2))

  # Control predictors: one N(0, 0.1^2) noise column per real feature.
  n_rows <- nrow(df)
  n_cols <- length(features)
  ctrl_features <- matrix(rnorm(n_cols * n_rows, 0, 0.1),
                          nrow = n_rows, ncol = n_cols)
  Z <- as.data.frame(cbind(df[glue_task], ctrl_features))
  noise_fit <- lm(sprintf("%s ~ .", glue_task), data = Z)
  noise_rmse <- sqrt(mean(summary(noise_fit)$residuals^2))

  # Guard the percentage against a zero denominator.
  reduction <- if (noise_rmse == 0) {
    0
  } else {
    (noise_rmse - probe_rmse) / noise_rmse * 100
  }

  list("summary_result" = model_summary,
       "RMSE" = probe_rmse,
       "RMSE_reduction" = reduction)
}

# Character matrix (probing task x layer) holding every
# "<probe_task>_layer_<k>" column name.
all_probe_features <- outer(all_probe_tasks, paste0("_layer_", 1:12),
                            FUN = "paste0")

# Exhaustively search all 3-feature subsets of `all_probe_features` for the
# one whose plain-lm fit of `glue_task` has the smallest residual RMSE,
# then re-evaluate that subset with the caret-based probing function.
#
# Returns a list with the RMSE reduction of the winning subset
# (`max_rmse_reduction`) and the three feature names (`best_features`).
find_best_features <- function(glue_task) {
  best_features <- NA
  smallest_rmse <- 10000

  # combn() yields index triples i < j < k in lexicographic order, i.e. the
  # order three nested index loops would visit them.
  triples <- combn(length(all_probe_features), 3)
  for (col in seq_len(ncol(triples))) {
    feats <- all_probe_features[triples[, col]]
    candidate <- probing_some_layers_some_ptasks_fast(glue_task, feats)
    if (candidate$RMSE < smallest_rmse) {
      smallest_rmse <- candidate$RMSE
      best_features <- feats
    }
  }

  final <- probing_some_layers_some_ptasks(glue_task, best_features)
  list("max_rmse_reduction" = final$RMSE_reduction,
       "best_features" = best_features)
}

# Seed here, as sections 1 and 2 do, so this chunk gives the same numbers
# when it is run on its own.
set.seed(1234)
for (gt in all_glue_tasks) {
  tic("find_best_features")
  retval <- find_best_features(gt)
  toc()
  # Collapse the feature names first: sprintf() recycles over vectors and
  # would otherwise emit one near-identical line per feature.
  print(sprintf("Glue task %s, max rmse reduction %.2f, achieved using %s",
                gt, retval$max_rmse_reduction,
                paste(retval$best_features, collapse = ", ")))
}
```

