#!/bin/bash

# #############################################################################
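# Requirements:
#  * sacremoses
#  * sacrebleu
#  * fastBPE
#  * dos2unix, wget, curl, unzip, git, ant
#  * wordpiece_tokenize.py and sample_natural_noise.py in the directory the
#    script is run from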
# #############################################################################
# Directory holding the fastBPE `fast` binary. Defaults to the local build
# under $HOME; export FAST_BPE before running to use a different install.
FAST_BPE=${FAST_BPE:-$HOME/local/fastBPE/bin}


# =============================================================================
# Training Data
# - en-de WMT14 data
# - en-cs WMT17 data
# - en-fr data (CommonCrawl, Europarl v7, News Commentary v9)
# =============================================================================
# Scratch directory for the raw downloads, one sub-directory per language pair.
mkdir -p tmp_data
# Stop right away if the scratch directory is unusable; everything after this
# point writes relative to it.
cd tmp_data || exit 1
# enfr is needed as well: the CommonCrawl, Europarl and News Commentary v9
# sections move their fr-en files into it.
mkdir -p ende encs enfr

# News Commentary v13: English-German and English-Czech.
wget http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz

# Extract all four members in one pass so the archive is decompressed only
# once instead of once per file.
tar -zxvf training-parallel-nc-v13.tgz \
    training-parallel-nc-v13/news-commentary-v13.de-en.{en,de} \
    training-parallel-nc-v13/news-commentary-v13.cs-en.{en,cs}
rm training-parallel-nc-v13.tgz

mv training-parallel-nc-v13/news-commentary-v13.de-en.{en,de} ende/
mv training-parallel-nc-v13/news-commentary-v13.cs-en.{en,cs} encs/

# rmdir (not rm -r) on purpose: it only succeeds if every extracted file was
# moved away.
rmdir training-parallel-nc-v13

# # -----------------------------------------------------------------------------

# CommonCrawl: English-German, English-French and English-Czech.
wget http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz

# One pass over the archive for all six members; the original re-read and
# re-decompressed the whole tarball for every single file.
tar -zxvf training-parallel-commoncrawl.tgz \
    commoncrawl.de-en.{en,de} \
    commoncrawl.fr-en.{en,fr} \
    commoncrawl.cs-en.{en,cs}
rm training-parallel-commoncrawl.tgz

mv commoncrawl.de-en.{en,de} ende/
mv commoncrawl.fr-en.{en,fr} enfr/
mv commoncrawl.cs-en.{en,cs} encs/

# -----------------------------------------------------------------------------

# Europarl v7: English-German, English-Czech and English-French.
wget http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz

# Single extraction pass for all six members instead of six full passes.
tar -zxvf training-parallel-europarl-v7.tgz \
    training/europarl-v7.de-en.{en,de} \
    training/europarl-v7.cs-en.{en,cs} \
    training/europarl-v7.fr-en.{en,fr}
rm training-parallel-europarl-v7.tgz

mv training/europarl-v7.de-en.{en,de} ende/
mv training/europarl-v7.cs-en.{en,cs} encs/
mv training/europarl-v7.fr-en.{en,fr} enfr/
# Only succeeds when every extracted file has been moved out.
rmdir training

# -----------------------------------------------------------------------------

# News Commentary v9: English-French only.
wget http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz
# Both members in a single pass over the archive.
tar -zxvf training-parallel-nc-v9.tgz \
    training/news-commentary-v9.fr-en.{en,fr}
rm training-parallel-nc-v9.tgz

# Drop every carriage return from the extracted files before they are merged
# with the other corpora.
sed -i 's/\r//g' training/*
mv training/news-commentary-v9.fr-en.{en,fr} enfr/
rmdir training

# -----------------------------------------------------------------------------

# CzEng plaintext release: additional English-Czech training data.
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1458/data-plaintext-format.tar
tar xvf data-plaintext-format.tar
# Remove the tarball once unpacked, as the other download sections do.
rm data-plaintext-format.tar

# Concatenate all training shards into one TSV, then split it by column:
# column 3 goes to the Czech file and column 4 to the English file.
zcat data.plaintext-format/*train.gz > czeng.tsv
cut -f 3 czeng.tsv > encs/czeng.cs
cut -f 4 czeng.tsv > encs/czeng.en
# The intermediate TSV is not used again, so do not leave it in the scratch
# directory.
rm czeng.tsv

# Make the tree writable first (presumably it unpacks read-only), otherwise
# rm -rf could not delete it.
chmod -R +w data.plaintext-format
rm -rf data.plaintext-format

# -----------------------------------------------------------------------------

# COLLECT AND TOKENIZE EVERYTHING

# For each language pair: normalise line endings, glue all corpora together
# as tab-separated sentence pairs, shuffle them, split the pairs back into an
# English and a target-language file, and tokenize both with sacremoses.
for TGT in de cs fr; do
    RAW_DIR=en${TGT}
    OUT_DIR=en${TGT}_final
    mkdir "$OUT_DIR"

    dos2unix "$RAW_DIR"/*
    # The two globs list the corpora in the same sorted order, which keeps the
    # English and target-language lines aligned when pasted side by side.
    paste <(cat "$RAW_DIR"/*.en) <(cat "$RAW_DIR"/*."$TGT") \
        | shuf > "$RAW_DIR/all.en$TGT"
    cut -f1 "$RAW_DIR/all.en$TGT" > "$OUT_DIR/en"
    cut -f2 "$RAW_DIR/all.en$TGT" > "$OUT_DIR/$TGT"

    sacremoses tokenize -l en < "$OUT_DIR/en" > "$OUT_DIR/en.tok"
    sacremoses tokenize -l "$TGT" < "$OUT_DIR/$TGT" > "$OUT_DIR/$TGT.tok"
done

# Leave the scratch directory and lay out the final data tree.
cd ..
# entr is created here too: the Turkish download and the en-tr validation and
# test sets are written into data/entr/{train,val,test} further down.
mkdir -p data/en{de,cs,fr,tr}/{train,val,test}


for TGT in de cs fr; do
    # Move the *contents* of the final directory. train/ already exists, so
    # moving the directory itself would nest it as train/en${TGT}_final and
    # the later data/*/train/?? globs would find nothing.
    mv tmp_data/en${TGT}_final/* data/en${TGT}/train/
    rmdir tmp_data/en${TGT}_final
done

# =============================================================================
# Download Turkish
# =============================================================================

# SETIMES v2 English-Turkish corpus from OPUS.
# The output directory must exist before the redirections below can write
# into it.
mkdir -p data/entr/train

# The URL is quoted because it contains `?`, which the shell would otherwise
# treat as a glob character.
wget 'http://opus.nlpl.eu/download.php?f=SETIMES/v2/moses/en-tr.txt.zip' -O en-tr.txt.zip

# unzip -p streams a single archive member to stdout.
unzip -p en-tr.txt.zip SETIMES.en-tr.en > data/entr/train/en
unzip -p en-tr.txt.zip SETIMES.en-tr.tr > data/entr/train/tr
rm en-tr.txt.zip

# =============================================================================
# Validation and test data
# =============================================================================

# Each spec is <pair dir>:<validation test set>:<test test set>. For every
# pair the source side is stored as `en` and the reference side under the
# target language code.
for SPEC in ende:wmt13:wmt14 encs:wmt17:wmt18 entr:wmt17:wmt18; do
    IFS=: read -r PAIR VAL_SET TEST_SET <<< "$SPEC"
    TGT=${PAIR#en}

    sacrebleu -t "$VAL_SET" -l "en-$TGT" --echo src > "data/$PAIR/val/en"
    sacrebleu -t "$VAL_SET" -l "en-$TGT" --echo ref > "data/$PAIR/val/$TGT"
    sacrebleu -t "$TEST_SET" -l "en-$TGT" --echo src > "data/$PAIR/test/en"
    sacrebleu -t "$TEST_SET" -l "en-$TGT" --echo ref > "data/$PAIR/test/$TGT"
done

# =============================================================================
# Apply BPE on everything
# =============================================================================

# Word-piece tokenize every two-letter language file (en, de, cs, ...) in each
# split of each language pair.
for F in data/*/{train,test,val,val_full}/??; do
    # A pattern that matches nothing (e.g. a pair without a val_full split) is
    # left as a literal string by bash; skip it instead of feeding a
    # non-existent path to the tokenizer.
    [[ -f $F ]] || continue
    echo "$F"
    ./wordpiece_tokenize.py "$F" > "$F.wtok"
done

# Learn 32k BPE merges per language pair from both sides of the training data.
# entr is included because the truncation and application loops that follow
# read data/entr/wbpe32k, which nothing else creates.
for PAIR in ende enfr encs entr; do
    "$FAST_BPE/fast" learnbpe 32000 data/$PAIR/train/??.wtok > "data/$PAIR/wbpe32k"
done

# Derive smaller BPE code files by keeping only the first N lines of a larger
# one.
# Arguments: $1 - directory that contains wbpe32k
# Outputs:   wbpe{16,8,4,2,1}k cut from wbpe32k, and wbpe950 ... wbpe0 (step
#            50) cut from wbpe1k, all written into $1
truncate_bpe_codes() {
    local pair_dir=$1
    local ksize size

    for ksize in 16 8 4 2 1; do
        head -n "${ksize}000" "$pair_dir/wbpe32k" > "$pair_dir/wbpe${ksize}k"
    done

    # The line count must be the loop's own size. The original passed the
    # leftover KSIZE (always 1 here), so every sub-1k file held one line.
    for size in {950..0..50}; do
        head -n "$size" "$pair_dir/wbpe1k" > "$pair_dir/wbpe${size}"
    done
}

for PAIR in ende encs entr; do
    truncate_bpe_codes "data/$PAIR"
done

# Segment every word-piece-tokenized file with every code size. The result
# for <name>.wtok is written to <name>.bpew<size>; the "@@ " continuation
# markers are then replaced by plain spaces in place.
BPE_SIZES=(32k 16k 8k 4k 2k 1k {950..0..50})
for PAIR in ende encs entr; do
    for SIZE in "${BPE_SIZES[@]}"; do
        for WTOK in data/$PAIR/*/*.wtok; do
            echo "$WTOK"
            SEGMENTED=${WTOK%.wtok}.bpew${SIZE}
            "$FAST_BPE/fast" applybpe "$SEGMENTED" "$WTOK" "data/$PAIR/wbpe${SIZE}"
            sed -i 's/@@ / /g' "$SEGMENTED"
        done
    done
done

# Prepare the data with sampled noise
# The charNMT-noise repository provides the per-language *.natural files that
# are passed to sample_natural_noise.py.
git clone https://github.com/ybisk/charNMT-noise

NOISE_BPE_SIZES=(32k 16k 8k 4k 2k 1k {950..0..50})
for TGT in de cs tr; do
    for LNG in en "$TGT"; do
        CLEAN=data/en$TGT/test/$LNG.wtok
        NOISY=$CLEAN.noisy

        # The output of all ten noise levels (0.1 ... 0.9, 1.0) is appended
        # to one file.
        for LEVEL in 0.{1..9} 1.0; do
            ./sample_natural_noise.py "charNMT-noise/noise/$LNG.natural" "$LEVEL" "$CLEAN" >> "$NOISY"
        done

        # Segment the noisy file with every code size, then replace the "@@ "
        # continuation markers with plain spaces.
        for SIZE in "${NOISE_BPE_SIZES[@]}"; do
            "$FAST_BPE/fast" applybpe "$NOISY.$SIZE" "$NOISY" "data/en$TGT/wbpe$SIZE"
            sed -i 's/@@ / /g' "$NOISY.$SIZE"
        done
    done
done

# Download everything for Morpheval evaluation

mkdir morpheval
cd morpheval

# Fetch the Morpheval test suite files, the MorphoDiTa and SMOR archives and
# the lefff pickle.
for URL in \
    https://morpheval.limsi.fr/morpheval.limsi.v2.en.info \
    https://morpheval.limsi.fr/morpheval.limsi.v2.en.sents \
    https://github.com/ufal/morphodita/releases/download/v1.3.0/morphodita-1.3.0-bin.zip \
    https://www.cis.uni-muenchen.de/~schmid/tools/SMOR/data/SMOR-linux.tar.gz \
    https://morpheval.limsi.fr/lefff.pkl
do
    wget "$URL"
done

# Unpack the two tool archives.
unzip morphodita-1.3.0-bin.zip
tar zxvf SMOR-linux.tar.gz

# Word-piece tokenize the English test sentences; the tokenizer script sits
# one level up because the working directory is now morpheval/.
../wordpiece_tokenize.py morpheval.limsi.v2.en.sents > morpheval.limsi.v2.en.sents.wtok

# Segment the tokenized Morpheval sentences with every BPE code file of every
# language pair; output goes to segmented/<pair>/sents.<pair>.<codes file>.
for PAIR in ende encs enfr; do
    # Nothing else creates the output directory, and the redirection below
    # fails without it.
    mkdir -p "segmented/$PAIR"
    for BPE in ../data/$PAIR/wbpe*; do
        echo "$BPE"
        # ${BPE##*/} is the codes file name (e.g. wbpe32k). It replaces a
        # hard-coded character offset that assumed a fixed prefix length.
        "$FAST_BPE/fast" applybpe_stream "$BPE" < morpheval.limsi.v2.en.sents.wtok \
            | sed -e 's/@@ / /g' > "segmented/$PAIR/sents.$PAIR.${BPE##*/}"
    done
done

# Morpheval v2 evaluation scripts.
git clone https://github.com/franckbrl/morpheval_v2

# Czech MorfFlex/PDT model archive. The braces are deliberately passed through
# to curl (bash does not expand a single-item brace list);
# --remote-name-all stores the download under its remote file name.
curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1836{/czech-morfflex-pdt-161115.zip}
unzip czech-morfflex-pdt-161115.zip

# Build METEOR with ant inside its own checkout.
git clone https://github.com/cmu-mtlab/meteor
# Abort if the checkout is missing rather than running ant in the morpheval
# directory.
cd meteor || exit 1
ant
