## ---------------------------
##
## Script name: json_webnlg_featextract
##
## Purpose of script: Get features from webnlg json files
##
## Date Modified: 2021-03-28
##
## ---------------------------

## set directory to where the R script is
## Move to the folder that contains this script so that the relative paths
## used below ("processed/...", "webnlg_*_segmented.txt") resolve.
## Inside RStudio the path of the active document is used. Outside RStudio
## (e.g. `Rscript`), the `--file=` command-line argument is used instead.
## If neither yields a path, the working directory is left unchanged and a
## warning is raised instead of stopping the script.
local({
  script_path <- ""

  # rstudioapi calls error when no RStudio session is running, so check first.
  if (requireNamespace("rstudioapi", quietly = TRUE) &&
      rstudioapi::isAvailable()) {
    script_path <- rstudioapi::getActiveDocumentContext()$path
  }

  # The path is empty for unsaved documents; fall back to the Rscript argument.
  if (length(script_path) != 1L || is.na(script_path) || !nzchar(script_path)) {
    script_path <- ""
    file_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE),
                     value = TRUE)
    if (length(file_arg) > 0) {
      script_path <- sub("^--file=", "", file_arg[1])
    }
  }

  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  } else {
    warning("Could not determine the script location; ",
            "keeping the current working directory: ", getwd(),
            call. = FALSE)
  }
})

## Required packages: jsonlite, tidyverse, strex, data.table
#install.packages("jsonlite")
library(jsonlite)
library(tidyverse)
library(strex)
library(data.table)

## Read in train, dev and test json files

## Each split is parsed from its JSON file, coerced to a data frame, and
## tagged with a `type` column naming the split it came from.
train <- add_column(
  as.data.frame(fromJSON("processed/train.json")),
  type = "train"
)

dev <- add_column(
  as.data.frame(fromJSON("processed/dev.json")),
  type = "dev"
)

test <- add_column(
  as.data.frame(fromJSON("processed/test.json")),
  type = "test"
)

## Bind train, dev and test dfs.
## Stack the three splits, reorder the columns by position, turn any factor
## column into character, and prepend a running row ID (1..n) named "ID".
webnlg_df <- rbind(train, dev, test) %>%
  as.data.frame() %>%
  # Positional reordering: the column names come from the JSON files.
  select(3, 6, 5, 1, 2, 4) %>%
  mutate(across(where(is.factor), as.character)) %>%
  rowid_to_column(var = "ID")

## The per-split data frames are no longer needed once they are combined.
rm(list = c("train", "dev", "test"))

## Purpose of the following pipe: recreating triples from the pre and pos columns + recreating unique triples for each expression. 

## Rebuild, for every row, the full triple string from the text before the
## entity (pre_context), the entity itself, and the text after it
## (pos_context). Two variants are produced:
##   triple        - uses the plain entity; rows sharing the same string get
##                   the same `triple_id`.
##   triple_unique - uses the entity suffixed with "_ref<ID>", so the marked
##                   expression of this row can be found again later.
webnlg <- webnlg_df %>%
  as.data.frame() %>%
  mutate_if(is.factor, as.character) %>%
  mutate(
    # NOTE(review): the tests against 'character(0)' and "c\\(" suggest that
    # pre_context holds deparsed character vectors - confirm against the data.
    #   empty context        -> ''
    #   deparsed vector      -> everything from the first to the last quote
    #   already quoted value -> kept as is
    #   bare value           -> wrapped in double quotes
    # Afterwards quote/backslash pairs left over from escaping are stripped.
    pre_context_proc = ifelse(
      pre_context == 'character(0)',
      '',
      ifelse(
        grepl("c\\(", pre_context),
        str_extract(as.character(pre_context), '".*"'),
        ifelse(
          grepl('"', pre_context),
          pre_context,
          paste0('"', pre_context, '"')
        )
      )
    ) %>%
      str_remove_all('"\\\\') %>%
      str_remove_all('\\\\"'),
    # A post context that is just "." is quoted directly; anything else is
    # reduced to the span between its first and last double quote.
    pos_context_proc = ifelse(
      pos_context == '.',
      paste0('"', pos_context, '"'),
      str_extract(as.character(pos_context), '".*"')
    ) %>%
      str_remove_all('\\\\"') %>%
      str_remove_all('"\\\\'),
    # The entity token gets a leading ", " separator only when something
    # precedes it in the triple.
    entity_proc = ifelse(
      pre_context_proc == '',
      paste0('"', entity, '", '),
      paste0(', "', entity, '", ')
    ),
    entity_unique_id = ifelse(
      pre_context_proc == '',
      paste0('"', entity, '_ref', ID, '", '),
      paste0(', "', entity, '_ref', ID, '", ')
    ),
    triple = paste0(pre_context_proc, entity_proc, pos_context_proc),
    triple_unique = paste0(pre_context_proc, entity_unique_id, pos_context_proc)
  ) %>%
  group_by(triple) %>%
  mutate(triple_id = cur_group_id()) %>%
  ungroup()

## Purpose of the following pipe: create a minimal dataframe which will be used for sentence and token segmentation. 
## Keep only the two columns needed for segmentation and add `unique_id`,
## the row number (as character) of each row in `webnlg`, so that the
## segmentation results can be joined back to `webnlg` later.
sent_token <- webnlg[, c("triple_unique", "triple_id")]
sent_token$unique_id <- rownames(sent_token)


## Purpose of the following (commented-out) code: it goes through the sent_token dataframe and splits a string wherever it finds "." (a dot enclosed in quotation marks). A look-behind (?<=) is used to keep the dot delimiter attached to the preceding sentence. After it finds the dot, it splits the string and copies the rest into the next row. Since the transformation takes time (30-45 minutes), the result has been saved to a tab-separated text file.
## This segments each unique triple (a unique triple is one in which only one expression is marked). Better approaches could have been used.

# sent_split <- as.data.frame(do.call(rbind, apply(sent_token, 1, function(x) {
#   do.call(expand.grid, strsplit(x, '(?<="\\.")', perl = TRUE))  
# })))
# 
# write_delim(sent_split,"webnlg_sent_segmented.txt", delim = "\t")  

## Read in the sentence segmented file.
## Load the pre-computed sentence segmentation (tab-separated) and number the
## rows within each `unique_id` in file order: 1 for the first row of that
## unique_id, 2 for the second, and so on.
sent_segmented <- read_delim("webnlg_sent_segmented.txt", delim = "\t") %>%
  group_by(unique_id) %>%
  mutate(sentence_number = row_number()) %>%
  ungroup()

## Purpose of the following pipe: token segmentation

# token_split <- as.data.frame(do.call(rbind, apply(sent_segmented, 1, function(x) {
#   do.call(expand.grid, strsplit(x, '(?<=", )', perl = TRUE))  
# })))


#write_delim(token_split,"webnlg_token_segmented.txt", delim = "\t")

## Read in the token segmented files
## Load the pre-computed token segmentation (tab-separated) and derive
## position/length counters for every token row.
token_segmented <- read_delim("webnlg_token_segmented.txt", delim = "\t") %>%
  group_by(triple_id, unique_id) %>%
  mutate(
    token_in_triple_id = row_number(), # position of the token in the triple
    triple_length = n()                # number of tokens in the triple
  ) %>%
  group_by(unique_id, sentence_number) %>%
  mutate(
    token_in_sent_id = row_number(),   # position of the token in its sentence
    sentence_length = n()              # number of tokens in the sentence
  ) %>%
  ungroup()

## Keep only the tokens that contain a referring-expression marker and exclude all other tokens

## Retain the token rows whose text contains a "ref<digits>" marker, expose
## their `unique_id` as a numeric `ID` (the join key used with `webnlg`), and
## drop the two triple columns.
target_tokens <- token_segmented %>%
  filter(str_detect(triple_unique, "ref\\d+")) %>%
  mutate(unique_id = as.numeric(unique_id)) %>%
  rename(ID = unique_id) %>%
  select(-triple_unique, -triple_id)

## Purpose of the next pipe: extracting features 
## NOTE: there are cases where the differences are very tiny, so the delexicalized forms of the expressions look the same. I excluded those cases because they were giving me negative values. Hence, the original webnlg df has 94515 rows, while webnlg_with_feat has 93064 rows.

## Attach the token-level positions to every referring expression and derive
## the mention, distance and salience features.
webnlg_with_feat <- webnlg %>%
  # Bring in sentence/token positions of each marked expression.
  left_join(target_tokens, by = "ID") %>%
  # Triples are built from entities rather than from the referring
  # expressions, so different triples can produce the same triple string (and
  # therefore the same triple_id). Only the first row per
  # (triple_id, sentence, token position) is kept; keeping the later ones led
  # to negative distance values.
  mutate(
    very_unique_id = paste0(triple_id, '_', sentence_number, '_',
                            token_in_triple_id)
  ) %>%
  distinct(very_unique_id, .keep_all = TRUE) %>%
  # ---- per triple ----
  group_by(triple_id) %>%
  mutate(
    # sentence number of the last row of the triple
    triple_sent_count = last(sentence_number),
    # running index of the markable within the triple
    markable_count = row_number(),
    # number of distinct entities in the triple
    distinct_referents = n_distinct(entity)
  ) %>%
  # ---- per entity within a triple ----
  group_by(triple_id, entity) %>%
  mutate(
    # number of mentions of this entity in the triple
    triple_mention_count = n(),
    # running index of this mention among the mentions of the entity
    which_mention = row_number(),
    # is this the first mention of the entity?
    first_mention = ifelse(which_mention == '1', 'yes', 'no'),
    # mentions of the entity still to come after this one
    how_many_more_mention = triple_mention_count - which_mention,
    # first / second / last / middle; 'first' and 'second' take precedence
    # over 'last'
    mention_order_cat = ifelse(
      which_mention == '1', 'first',
      ifelse(
        which_mention == '2', 'second',
        ifelse(which_mention == triple_mention_count, 'last', 'middle')
      )
    )
  ) %>%
  ungroup() %>%
  # does the expression occur in the first sentence of the triple?
  mutate(first_sentence = ifelse(sentence_number == '1', 'yes', 'no')) %>%
  # ---- distances to the previous mention of the same entity ----
  # (NA for a first mention, because lag() has nothing to look back at)
  group_by(triple_id, entity) %>%
  mutate(
    # words strictly between the two mentions
    word_distance_num = token_in_triple_id - lag(token_in_triple_id) - 1,
    # sentence difference (0 = same sentence)
    sent_distance_num = sentence_number - lag(sentence_number),
    # markables strictly between the two mentions
    markable_distance = markable_count - lag(markable_count) - 1
  ) %>%
  ungroup() %>%
  # ---- binned versions of the distances (NA distances stay NA) ----
  mutate(
    # word distance in five bins: <11, <21, <31, <41, otherwise
    w_dist_5_bins = ifelse(
      word_distance_num < 11, 'first',
      ifelse(
        word_distance_num < 21, 'second',
        ifelse(
          word_distance_num < 31, 'third',
          ifelse(word_distance_num < 41, 'fourth', 'fifth')
        )
      )
    ),
    # word distance in three bins: <6, <13, otherwise
    w_dist_3_bins = ifelse(
      word_distance_num < 6, 'first',
      ifelse(word_distance_num < 13, 'second', 'third')
    ),
    # sentence distance in two bins: <2, otherwise
    sent_dist_2_bins = ifelse(sent_distance_num < 2, 'first', 'second'),
    # sentence distance in three bins: <1, <2, otherwise
    sent_dist_3_bins = ifelse(
      sent_distance_num < 1, 'first',
      ifelse(sent_distance_num < 2, 'second', 'third')
    ),
    # does the immediately preceding markable refer to the same entity?
    same_prev_mention = ifelse(
      is.na(markable_distance), NA,
      ifelse(markable_distance == '0', 'yes', 'no')
    ),
    # is the antecedent in the same sentence? (the negation of "is the
    # referent new in the sentence?")
    same_sent_ante = ifelse(sent_distance_num == '0', 'yes', 'no')
  ) %>%
  # number of distinct entities in the sentence
  group_by(triple_id, sentence_number) %>%
  mutate(diff_ref_sentence = n_distinct(entity)) %>%
  # number of rows in which the entity occurs across the whole data frame
  group_by(entity) %>%
  mutate(global_salience = n()) %>%
  ungroup()


## Build an entity -> gender lookup from the raw lines of gender.json.
## The first line is dropped (presumably the opening brace - TODO confirm).
## Each remaining line is split at the first ': '; the left part becomes
## `entity`, the right part `gender`, both trimmed and stripped of double
## quotes.
## NOTE(review): the file is split textually rather than parsed as JSON, so
## anything other than whitespace and double quotes (e.g. a trailing comma
## after a value) stays in the result - confirm against the file.
gender <- data.frame(
  entity = read_lines("processed/gender.json")[-1],
  stringsAsFactors = FALSE
) %>%
  mutate(
    # `gender` must be derived first: it reads the raw line still held in
    # `entity`, which is overwritten by the next expression.
    gender = str_remove_all(trimws(str_after_first(entity, ': ')), '"'),
    entity = str_remove_all(trimws(str_before_first(entity, ': ')), '"')
  )

## Build an entity -> type lookup from the raw lines of entity_types.json and
## attach the gender of each entity to it.
## The first line is dropped (presumably the opening brace - TODO confirm).
## Each remaining line is split at the first ': '; the left part becomes
## `entity`, the right part `type`, both trimmed and stripped of double
## quotes.
## NOTE(review): the file is split textually rather than parsed as JSON, so
## anything other than whitespace and double quotes (e.g. a trailing comma
## after a value) stays in the result - confirm against the file.
entity_info <- data.frame(
  entity = read_lines("processed/entity_types.json")[-1],
  stringsAsFactors = FALSE
) %>%
  mutate(
    # `type` must be derived first: it reads the raw line still held in
    # `entity`, which is overwritten by the next expression.
    type = str_remove_all(trimws(str_after_first(entity, ': ')), '"'),
    entity = str_remove_all(trimws(str_before_first(entity, ': ')), '"')
  ) %>%
  left_join(gender, by = "entity")


## Add the per-entity information (type and gender) to the feature table.
## NOTE(review): `entity_info` has a `type` column and the feature table
## appears to carry the split label in a column of the same name, so the join
## would yield `type.x` / `type.y` - confirm this is intended.
webnlg_with_feat <- left_join(webnlg_with_feat, entity_info, by = "entity")
