#!/usr/bin/env bash
#
# End-to-end driver for the es-en experiment: downloads the Europarl v7 es-en
# corpus and the Europarl tools archive, tokenizes/lowercases/truncates both
# sides of the corpus, then renders the CL-Eigenwords and task-evaluation
# R Markdown documents and runs the BilBOWA script.
#
# Run from the directory that contains the .Rmd files and
# run_bilbowa_multiple_dim.sh; all paths below are relative to it.
#
# Interpreter note: the script prefixes commands with `time`, which is a
# reserved word in bash but not in every /bin/sh (e.g. dash), and it already
# invokes bash explicitly for the BilBOWA step, so bash is required.
#
# External commands used: wget, tar, awk, R (with the rmarkdown package), bash.


# Download a .tgz archive (unless it is already present) and unpack it.
#
# The function body is a subshell, so the caller's working directory and
# variables are left untouched.
#
# Arguments:
#   $1 - URL of the archive; the local file name is the URL's last component
#   $2 - directory to download into and extract in (default: current
#        directory); created if it does not exist
# Returns:
#   non-zero if the directory cannot be created/entered, the download fails,
#   or tar fails
fetch_and_extract() (
  url=$1
  dest=${2:-.}
  archive=${url##*/}

  # -p keeps re-runs quiet when the directory already exists.
  mkdir -p "$dest" || exit 1
  cd "$dest" || exit 1

  if [ ! -e "$archive" ]; then
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer is never mistaken for a complete archive by the
    # existence check on the next run.
    if ! wget -O "$archive.part" "$url"; then
      rm -f "$archive.part"
      exit 1
    fi
    mv "$archive.part" "$archive" || exit 1
  fi

  tar zxvf "$archive"
)

# Download corpus data from Europarl
fetch_and_extract http://www.statmt.org/europarl/v7/es-en.tgz corpus || exit 1

# Download tokenizer tool
fetch_and_extract http://www.statmt.org/europarl/v7/tools.tgz || exit 1

# Tokenize and convert to lowercase and extract the first 500,000 lines
#
# Processes one language side of the es-en corpus.
#   reads  ./corpus/europarl-v7.es-en.<lang>
#   writes ./corpus/europarl-v7.es-en.<lang>.tokenized.500000
# Arguments:
#   $1 - language code, passed to the tokenizer's -l option and used as the
#        corpus file suffix (es or en)
# Returns:
#   exit status of the pipeline (its last stage, awk)
tokenize_side() {
  # One awk stage both lowercases and truncates. Exiting right after line
  # 500000 closes the pipe, so the tokenizer stops instead of processing the
  # rest of the corpus only to have it discarded.
  ./tools/tokenizer.perl -l "$1" < "./corpus/europarl-v7.es-en.$1" \
    | awk 'NR > 500000 { exit } { print tolower($0) }' \
    > "./corpus/europarl-v7.es-en.$1.tokenized.500000"
}

tokenize_side es
tokenize_side en


# Execute CL-Eigenwords
#
# Render run_cleigenwords.Rmd once through R, timing the run.
# Arguments:
#   $1 - value passed as the Rmd parameter `dim`
#   $2 - value passed as the Rmd parameter `dim.evd`
#   $3 - value passed as the Rmd parameter `vsdoc` (default: 3)
# The output_file handed to rmarkdown::render is
#   run_cleigenwords_<dim>_<dim.evd>_<vsdoc>.html
# All parameter values are passed to R as quoted strings.
render_cleigenwords() {
  time R -e "rmarkdown::render('run_cleigenwords.Rmd', output_file = 'run_cleigenwords_${1}_${2}_${3:-3}.html', params = list(dim='${1}', dim.evd='${2}', vsdoc='${3:-3}'))"
}

render_cleigenwords 40 80
render_cleigenwords 100 200
render_cleigenwords 200 400

# Execute BilBOWA
# Runs run_bilbowa_multiple_dim.sh (looked up in the current working
# directory) under bash, regardless of that script's own shebang.
# NOTE(review): judging by its name, the script presumably runs BilBOWA for
# several embedding dimensions — confirm against the script itself.
bash run_bilbowa_multiple_dim.sh

# Render Rmarkdown of experiment for ACL
#
# Render experiment_cleigenwords_taskeval.Rmd once through R, timing the run.
# Arguments:
#   $1 - value passed as the Rmd parameter `source` (es or en)
#   $2 - value passed as the Rmd parameter `target` (en or es)
#   $3 - value passed as the Rmd parameter `dim`
#   $4 - value passed as the Rmd parameter `dim.evd`
# The output_file handed to rmarkdown::render is
#   experiment_cleigenwords_taskeval_es-en_from-<source>_<dim>.html
# All parameter values are passed to R as quoted strings.
render_taskeval() {
  time R -e "rmarkdown::render('experiment_cleigenwords_taskeval.Rmd', output_file = 'experiment_cleigenwords_taskeval_es-en_from-${1}_${3}.html', params = list(source='${1}', target='${2}', dim='${3}', dim.evd='${4}'))"
}

# Each entry is <dim>:<dim.evd>; every setting is evaluated in both
# directions, es->en first and then en->es.
for dims in 40:80 100:200 200:400; do
  dim=${dims%%:*}
  dim_evd=${dims#*:}
  render_taskeval es en "$dim" "$dim_evd"
  render_taskeval en es "$dim" "$dim_evd"
done
