#!/usr/bin/env bash
#
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh

# Abort on the first failing command, unset variable, or failed pipeline
# stage, so that later steps never run on partial or empty data.
set -euo pipefail

# Relative paths to the Moses and subword-nmt helper scripts.
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt/subword_nmt
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
# Value handed to learn_bpe.py through its -s option.
BPE_TOKENS=10000

# Language pair; file suffixes throughout the script are built from these.
src=qu
tgt=re
# Derived from src/tgt so the pair name cannot drift out of sync with them.
lang=$src-$tgt
# prep: output directory; orig: directory holding the raw {train,valid,test}
# files; tmp: scratch area for the tokenized intermediates.
prep=stc-nhh-tok
orig=stc-nhh
tmp=$prep/tmp


# Start from a clean output directory. ${prep:?} makes the shell abort
# instead of expanding to "rm -rf /*" should prep ever be unset or empty.
rm -rf -- "${prep:?}"/*
mkdir -p -- "$prep" "$tmp"

# For every split and both languages: normalize punctuation, strip
# non-printing characters and tokenize $orig/<split>.<lang> into
# $tmp/<split>.<lang>.
for fn in train valid test; do
    echo "pre-processing ${fn} data..."
    for n in "$src" "$tgt"; do
        raw=$orig/$fn.$n
        tokenized=$tmp/$fn.$n
        echo "$raw"
        # Fail loudly on a missing input rather than writing an empty
        # tokenized file that the following steps would consume silently.
        if [[ ! -r "$raw" ]]; then
            echo "error: cannot read ${raw}" >&2
            exit 1
        fi
        # The tokenizer is always run with "-l en", whatever $n is.
        # '>' (not '>>') so a rerun can never append to stale output.
        perl "$NORM_PUNC" "$n" < "$raw" \
            | perl "$REM_NON_PRINT_CHAR" \
            | perl "$TOKENIZER" -threads 32 -a -l en > "$tokenized"
        echo "$tokenized"
    done
done

# Learn a single BPE model on the concatenation of both training sides.
# The joint corpus is named after $lang instead of a hard-coded pair.
TRAIN=$tmp/train.$lang
BPE_CODE=$prep/code
# Rebuild the joint corpus from scratch: source side first, then target.
cat -- "$tmp/train.$src" "$tmp/train.$tgt" > "$TRAIN"

echo "learn_bpe.py on ${TRAIN}..."
# Stop here if learning fails, so no truncated code file is applied later.
if ! python "$BPEROOT/learn_bpe.py" -s "$BPE_TOKENS" < "$TRAIN" > "$BPE_CODE"; then
    echo "error: learn_bpe.py failed on ${TRAIN}" >&2
    exit 1
fi

# Run apply_bpe.py with the codes in $BPE_CODE over each split of both
# languages, reading the tokenized text from $tmp and writing into $prep.
for side in "$src" "$tgt"; do
    for split in train valid test; do
        name=$split.$side
        echo "apply_bpe.py to ${name}..."
        python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" \
            < "$tmp/$name" > "$prep/$name"
    done
done

# Run fairseq-preprocess from inside $prep so the relative train/valid/test
# prefixes refer to the files written there. Abort if the directory cannot
# be entered; otherwise the tool would run against the wrong location.
cd "$prep" || { echo "error: cannot cd to ${prep}" >&2; exit 1; }
# Language codes come from $src/$tgt rather than being repeated literally.
fairseq-preprocess \
    --source-lang "$src" --target-lang "$tgt" \
    --trainpref train --validpref valid --testpref test \
    --joined-dictionary --workers 32