library(tidyverse)
library(gg.layers)
library(scales)
library(glue)
library(latex2exp)
library(kableExtra)
library(cowplot)
# Names of the two word lists being compared; both appear in result file names.
A = "FEMALE"
B = "MALE"

# Value that appears as the "-s<value>" suffix of the PMI result file names.
PMI_SMOOTHING = 0.01
# Key used for the PMI results in the per-model lists built below.
pmi_name = glue("pmi{PMI_SMOOTHING*100}")

Data

# Collect the bias result files of every model from the "results" directory.
# files_bias[[model]] is a character vector of paths, named by corpus.
files_bias = list()
files_bias[["glove"]] = list.files(
  "results", pattern=glue("bias_glove-.+-{A}-{B}\\.csv"), full.names=TRUE)
files_bias[["sgns"]] = list.files(
  "results", pattern=glue("bias_sgns-.+-{A}-{B}\\.csv"), full.names=TRUE)
files_bias[[pmi_name]] = list.files(
  "results", pattern=glue("bias_pmi-.+-{A}-{B}-s{PMI_SMOOTHING}\\.csv"),
  full.names=TRUE)

# The corpus name is the token right before "-<A>-<B>" in the file name.
# The pattern is built from A and B (instead of a literal "FEMALE-MALE") so it
# always agrees with the discovery patterns above; a mismatch would make
# str_match() return NA for every corpus name.
corpus_pattern = as.character(glue("bias.+-(.+)-{A}-{B}.*\\.csv"))
for (n in names(files_bias)) {
  corpus_names = str_match(files_bias[[n]], corpus_pattern)[,2]
  names(files_bias[[n]]) = corpus_names
}
# Load every result file: dfs[[model]][[corpus]] holds the parsed CSV.
dfs = lapply(files_bias, function(paths) {
  corpora = names(paths)
  tables = lapply(
    corpora,
    function(corpus) read_csv(paths[corpus], show_col_types=FALSE))
  setNames(tables, corpora)
})
# add frequency bins
#' Add a log10-frequency bin to every row of a data frame.
#'
#' Bin edges are placed every half unit of log10(freq) from 2 to 6, followed
#' by one last edge at the largest log10 frequency found in `df`.
#'
#' @param df Data frame with a numeric column `freq`.
#' @return `df` with an additional factor column `bins`. Rows whose log10
#'   frequency lies below the first edge, or whose frequency is missing,
#'   get `NA`.
add_frequency_bins = function(df) {
  log_freq = log10(df[["freq"]])
  # A single missing frequency must not turn the upper edge into NA.
  max_value = max(log_freq, na.rm=TRUE)
  regular_breaks = seq(2, 6., .5)
  # Keep only the regular edges strictly below the maximum: this keeps the
  # breaks unique and increasing when the maximum is at or below 6, where
  # appending it blindly would duplicate an edge or put it out of order.
  cuts = c(regular_breaks[regular_breaks < max_value], max_value)
  if (length(cuts) < 2) {
    # cut() interprets a single number as a bin count, so fall back to the
    # first regular bin when nothing lies above the lowest edge.
    cuts = regular_breaks[1:2]
  }
  df[["bins"]] = cut(log_freq, cuts, include.lowest=TRUE)
  return(df)
}

# For every model: bin each corpus by frequency, then stack the corpora into a
# single data frame whose `corpus` column holds the corpus name.
for (n in names(dfs)) {
  binned = lapply(dfs[[n]], add_frequency_bins)
  dfs[[n]] = bind_rows(binned, .id="corpus")
}

# Give the bias measure the same column name (`bias`) in every model.
dfs[[pmi_name]] = rename(dfs[[pmi_name]], bias=dpmi)
dfs[["sgns"]] = rename(dfs[["sgns"]], bias=bias_score)
dfs[["glove"]] = rename(dfs[["glove"]], bias=bias_score)
# drop columns and aggregate shuffled wikipedia
#' Keep the analysis columns and collapse the shuffled corpora into one.
#'
#' Rows whose `corpus` equals `corpus_name` are kept unchanged. All other rows
#' are aggregated per (`idx`, `word`, `bins`): `bias` becomes the mean and
#' `freq` the maximum, and the result is labelled "<corpus_name>_shuffled".
#'
#' @param df Data frame with columns `corpus`, `idx`, `word`, `freq`, `bins`
#'   and `bias`.
#' @param corpus_name Name of the corpus whose rows are kept as they are.
#' @return The rows of `corpus_name` followed by the aggregated rows.
clean_df = function(df, corpus_name="wiki2021") {
  df = df %>% select(corpus, idx, word, freq, bins, bias)
  df_original = df %>% filter(corpus == corpus_name)
  df_shuffled = df %>% filter(corpus != corpus_name)
  df_shuffled = df_shuffled %>%
    group_by(idx, word, bins) %>%
    summarise(
      bias = mean(bias),
      freq = max(freq),
      # Drop the grouping explicitly: no leftover groups to undo and no
      # "has grouped output by" message on every call.
      .groups = "drop"
    ) %>%
    mutate(corpus = glue("{corpus_name}_shuffled"))
  df_final = bind_rows(df_original, df_shuffled)
  return(df_final)
}

# Clean the data frame of every model, keeping the list names.
dfs = lapply(dfs, clean_df)
## `summarise()` has grouped output by 'idx', 'word'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'idx', 'word'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'idx', 'word'. You can override using the `.groups` argument.

Explore

# Frequencies of the context words.
# The vocabulary file is read as space-separated (word, frequency) pairs
# without a header line.
df_freq = read_delim(
  "data/working/vocab-wiki2021-V100.txt",
  delim=" ",
  col_names=c("Word", "Frequency"),
  show_col_types=FALSE)
female_words = readLines("words_lists/FEMALE.txt")
male_words = readLines("words_lists/MALE.txt")

# LaTeX table with the frequency of each word of the female list.
tab_female = filter(df_freq, Word %in% female_words)
cat(kableExtra::kable(tab_female, format="latex", booktabs=TRUE))
## 
## \begin{tabular}{lr}
## \toprule
## Word & Frequency\\
## \midrule
## her & 3720408\\
## she & 3517570\\
## daughter & 294043\\
## female & 282159\\
## woman & 236954\\
## \addlinespace
## sister & 179511\\
## girl & 141616\\
## hers & 5706\\
## \bottomrule
## \end{tabular}
# LaTeX table with the frequency of each word of the male list.
tab_male = filter(df_freq, Word %in% male_words)
cat(kableExtra::kable(tab_male, format="latex", booktabs=TRUE))
## 
## \begin{tabular}{lr}
## \toprule
## Word & Frequency\\
## \midrule
## he & 11815189\\
## his & 9603118\\
## him & 1811552\\
## son & 541828\\
## man & 443881\\
## \addlinespace
## brother & 287544\\
## male & 181471\\
## boy & 124326\\
## \bottomrule
## \end{tabular}
# words in each frequency bin
#' Rewrite interval labels so that both bounds read as powers of ten.
#'
#' Only the first comma, the first opening bracket and the first closing
#' bracket of each label are rewritten, e.g. "(2.5,3]" becomes
#' "(10^{2.5},10^{3}]".
#'
#' @param labs Character vector of interval labels such as "(2.5,3]".
#' @return Character vector of rewritten labels, one per input label.
clean_log_labels = function(labs) {
  rewritten = labs %>%
    str_replace(",", "},10^{") %>%
    str_replace("([\\[\\(])", "\\110^{") %>%
    str_replace("([\\]\\)])", "}\\1")
  return(rewritten)
}

# Number of word types per frequency bin, taken from the GloVe results of the
# "wiki2021" corpus, with the bin labels rewritten as powers of ten.
df_ = filter(dfs$glove, corpus == "wiki2021")
clean_labels = clean_log_labels(levels(df_[["bins"]]))
levels(df_[["bins"]]) = clean_labels

bin_counts = df_ %>%
  pull(bins) %>%
  table()
tab = setNames(as_tibble(bin_counts), c("Frequency", "# types"))

cat(kableExtra::kable(tab, format="latex", booktabs=TRUE))
## 
## \begin{tabular}{lr}
## \toprule
## Frequency & \# types\\
## \midrule
## {}[10\textasciicircum{}\{2\},10\textasciicircum{}\{2.5\}] & 116340\\
## (10\textasciicircum{}\{2.5\},10\textasciicircum{}\{3\}] & 54187\\
## (10\textasciicircum{}\{3\},10\textasciicircum{}\{3.5\}] & 26617\\
## (10\textasciicircum{}\{3.5\},10\textasciicircum{}\{4\}] & 13144\\
## (10\textasciicircum{}\{4\},10\textasciicircum{}\{4.5\}] & 6579\\
## \addlinespace
## (10\textasciicircum{}\{4.5\},10\textasciicircum{}\{5\}] & 3255\\
## (10\textasciicircum{}\{5\},10\textasciicircum{}\{5.5\}] & 1448\\
## (10\textasciicircum{}\{5.5\},10\textasciicircum{}\{6\}] & 441\\
## (10\textasciicircum{}\{6\},10\textasciicircum{}\{8.12\}] & 117\\
## \bottomrule
## \end{tabular}
# number of biased words

# Count the words of the "wiki2021" corpus by the sign of their PMI bias:
# non-negative values are labelled "FEMALE", negative ones "MALE".
# The PMI results are looked up through pmi_name rather than a literal key,
# so this keeps working when PMI_SMOOTHING changes.
dfs[[pmi_name]] %>%
  filter(corpus == "wiki2021") %>%
  pull(bias) %>%
  {ifelse(. >= 0, "FEMALE", "MALE")} %>%
  table()
## .
## FEMALE   MALE 
##  85407 136721

Plots

Boxplots

# boxplots
#' Boxplots of the bias in each frequency bin.
#'
#' Draws one box per level of `bins`, marks the mean of each bin and adds a
#' dashed reference line at zero. The bin labels are rendered through
#' `latex2exp::TeX` so that the bounds appear as powers of ten.
#'
#' @param df Data frame with a factor column `bins` and a numeric column
#'   `bias`.
#' @param ylab,xlab,title,subtitle Texts passed on to `ggplot2::labs()`.
#' @param effect_sizes If `TRUE`, the ratio mean(bias) / sd(bias) of every bin
#'   is printed above the boxes.
#' @return A ggplot object.
boxplots_plt = function(df, ylab="Female bias", xlab=NULL,
                        title=NULL, subtitle=NULL, effect_sizes=FALSE) {
  # Turn interval labels such as "[2,2.5]" into TeX source such as
  # "\[$10^{2},10^{2.5}$\]". The local is deliberately not called `labs`, so
  # it cannot be confused with ggplot2::labs() used further down.
  bin_labels = levels(df[["bins"]])
  bin_labels = str_replace(bin_labels, ",", "},10^{")
  bin_labels = str_replace(bin_labels, "([\\[\\(])", "\\1$10^{")
  bin_labels = str_replace(bin_labels, "([\\]\\)])", "}$\\1")
  # Square brackets at either end get a backslash in front of them
  # (presumably so that TeX() prints them literally -- TODO confirm).
  bin_labels = str_replace(bin_labels, "(\\])$", r"(\\])")
  bin_labels = str_replace(bin_labels, "^(\\[)", r"(\\[)")
  bin_labels = unlist(lapply(bin_labels, TeX))
  # plot
  p = ggplot(df, aes(x=bins, y=bias)) +
    gg.layers::geom_boxplot2(
      width.errorbar=0.2, fill="lightblue", color="black") +
    stat_summary(fun="mean", color="navy") +
    geom_hline(yintercept=0, color="black", linetype="dashed") +
    labs(x=xlab, y=ylab, title=title, subtitle=subtitle) +
    scale_x_discrete(labels=bin_labels, guide=guide_axis(angle=35)) +
    theme_minimal() +
    theme(
      axis.title.x=element_text(size=15), axis.title.y=element_text(size=16),
      axis.text=element_text(size=14), strip.text=element_text(size=15),
      plot.subtitle=element_text(size=16)
    )
  # isTRUE() avoids an error on NA and does not depend on the value of `T`.
  if (isTRUE(effect_sizes)) {
    df_effect_sizes = df %>%
      group_by(bins) %>%
      summarise(mean_bias = mean(bias), sd_bias = sd(bias)) %>%
      mutate(ef = mean_bias / sd_bias)
    # Put the text 7% of the current y range above the upper limit and extend
    # the axis up to that position.
    y_limits = layer_scales(p)$y$get_limits()
    y_adj = (y_limits[2] - y_limits[1]) * 0.07
    y_text = y_limits[2] + y_adj
    p = p +
      geom_text(
        data=df_effect_sizes,
        aes(x=bins, label=round(ef, 2)), y=y_text, color="navy", size=5) +
      lims(y = c(NA, y_text))
  }
  return(p)
}
# Display name of every model, keyed like `dfs`. The PMI key comes from
# pmi_name, so the lookup still works when PMI_SMOOTHING changes.
model_names = setNames(
  c("PMI", "SGNS", "GloVe"),
  c(as.character(pmi_name), "sgns", "glove"))
last_name = names(dfs)[length(names(dfs))]

# save_plot() does not create missing directories, so make sure it exists.
dir.create("results/plots", recursive=TRUE, showWarnings=FALSE)

# One figure per corpus, with the models stacked in a single column.
for (corpus_ in c("wiki2021", "wiki2021_shuffled")) {
  plot_list_ = list()
  for (n in names(dfs)) {
    name = model_names[n]
    # Only the bottom panel gets an x-axis title.
    x_label = NULL
    if (n == last_name) x_label = "Frequency"
    p_ = boxplots_plt(
      dfs[[n]] %>% filter(corpus == corpus_), ylab=glue("Female bias ({name})"),
      xlab=x_label, effect_sizes=TRUE)
    plot_list_[[n]] = p_
    print(p_)
  }
  outname = glue("results/plots/boxplots_{corpus_}.png")
  grid_ = plot_grid(plotlist=plot_list_, ncol=1)
  save_plot(outname, grid_, base_height=10, base_width=6, dpi=300)
}

# NOTE: we don't use facet_wrap because it is hard to combine with geom_text