#!/usr/bin/env bash
#
# Consistency test sets prepared by Voita et al., 2019 (When a Good Translation is Wrong in Context)
#
# bash sh/prepare/en-ru/voita_opensubs/consistency.sh deixis_dev (--case=true)
# bash sh/prepare/en-ru/voita_opensubs/consistency.sh deixis_test (--case=true)
# bash sh/prepare/en-ru/voita_opensubs/consistency.sh lex_cohesion_dev (--case=true)
# bash sh/prepare/en-ru/voita_opensubs/consistency.sh lex_cohesion_test (--case=true)
# bash sh/prepare/en-ru/voita_opensubs/consistency.sh ellipsis_infl (--case=true)
# bash sh/prepare/en-ru/voita_opensubs/consistency.sh ellipsis_vp (--case=true)

# Read script arguments and assign them to variables.
#
# Every argument of the form --name=value defines a global shell variable
# `name` holding `value` (e.g. --case=true sets case=true). Arguments that do
# not start with "--", such as the test-set name, are ignored here.
parse_cli_options() {
    local _arg _name _value
    for _arg in "$@"; do
        # Positional arguments are not options.
        [[ $_arg == --* ]] || continue
        _name=${_arg#--}
        _name=${_name%%=*}
        # The value is everything after the FIRST "=", so it may itself
        # contain "=" or whitespace; a bare --name yields an empty value.
        if [[ $_arg == *=* ]]; then
            _value=${_arg#*=}
        else
            _value=
        fi
        # Skip names that are not valid shell identifiers instead of letting
        # the assignment fail with a cryptic error.
        if [[ ! $_name =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
            echo "Ignoring malformed option: $_arg" >&2
            continue
        fi
        # printf -v assigns in the global scope here (no local of that name
        # exists), which a plain `declare` inside a function would not do.
        printf -v "$_name" '%s' "$_value"
    done
}
parse_cli_options "$@"
# Set true-case/lower-case option ("lower" when --case is absent or empty)
case=${case:-lower}



# Move to the data directory corresponding to the right language pair
src=en
tgt=ru
lang=$src-$tgt
DATA=$HOME/dev/fairseq/data/$lang
# Abort if the directory cannot be created or entered: every later path is
# relative to it (including the `rm -rf data-bin/...` step), so carrying on
# from the wrong working directory would be destructive.
mkdir -p "$DATA" || exit 1
cd "$DATA" || exit 1

# Setting variables
#
# External tools and helper scripts (paths relative to the current working
# directory)
TOOLS=../../tools
BPEROOT="${TOOLS}/subword-nmt/subword_nmt"
HEADS=../../scripts/retrieve_doc_heads.py

# Raw scoring data of the consistency test suites
orig=test_suites/good-translation-wrong-in-context/consistency_testsets/scoring_data

# Any casing other than the default "lower" uses directories prefixed with
# "<case>_".
if [[ $case != "lower" ]]; then
    corpus="${case}_voita_opensubs/testset_consistency"
    CODE_SOURCE_DIR="${case}_voita_opensubs/context_agnostic/standard"
else
    corpus="voita_opensubs/testset_consistency"
    CODE_SOURCE_DIR="voita_opensubs/context_agnostic/standard"
fi

# BPE settings and parallelism
BPE_CODE="${CODE_SOURCE_DIR}/code"
BPE_TOKENS=32000
N_THREADS=8


# Setting variables for the current option
#
# The test-set name is the first argument that is not a --name=value option,
# so it is found whether or not options precede it on the command line.
set=
for argument in "$@"; do
    if [[ $argument != --* ]]; then
        set=$argument
        break
    fi
done
# Without a test-set name $prep would be the corpus root itself, and the
# later `rm -rf data-bin/$prep` would wipe every binarized test set.
if [[ -z $set ]]; then
    echo "Usage: bash $0 <test_set> [--case=<case>]" >&2
    exit 1
fi
prep=$corpus/$set
tmp=$prep/tmp
mkdir -p "$tmp" || exit 1

echo "Pre-processing data..."

# Read blocks from stdin (one block per line, sentences joined by " _eos ")
# and write one sentence per line with an empty line after each block.
split_blocks() {
    # awk's `print $0,"\n"` emits the line, a space and two newlines (hence
    # the empty separator line); sed then breaks the block at every " _eos ".
    # This relies on sed treating "\n" in the replacement as a newline, which
    # GNU sed does.
    awk '{print $0,"\n"}' | sed "s/ _eos /\n/g"
}

# Copy stdin to stdout in lower case.
# sys.stdout.write is used instead of print so that no extra newline is
# appended and both casing modes produce identically terminated files.
# NOTE(review): presumably `python` is Python 3 in a UTF-8 locale; under
# Python 2 the non-ASCII (Russian) text would not be lowercased -- confirm.
lowercase_text() {
    python -c "import sys; sys.stdout.write(sys.stdin.read().lower())"
}

if [[ $case == "lower" ]]; then
    echo "Text is being lowercased!"
    split_blocks < "$orig/$set.src" | lowercase_text > "$tmp/test.$src"
    split_blocks < "$orig/$set.dst" | lowercase_text > "$tmp/test.$tgt"
else
    split_blocks < "$orig/$set.src" > "$tmp/test.$src"
    split_blocks < "$orig/$set.dst" > "$tmp/test.$tgt"
fi

# retrieve indices of headlines
#
# $HEADS is run on each pre-processed file and is expected to leave its
# output next to the input as <input>.heads, which is then moved into $prep.
for l in "$src" "$tgt"; do
    # Stop on failure: there would be no .heads file to move, and the final
    # copy of *.heads into data-bin would be incomplete.
    python "$HEADS" "$tmp/test.$l" || exit 1
    mv "$tmp/test.$l.heads" "$prep/test.$lang.$l.heads" || exit 1
done

echo "Applying BPEs..."
# Segment both sides with the BPE codes in $BPE_CODE, reading the
# pre-processed text from $tmp and writing the result into $prep.
for l in "$src" "$tgt"; do
    # Stop on failure so a missing or partial file is never binarized.
    python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" \
        < "$tmp/test.$l" > "$prep/test.$l" || exit 1
done

echo "Building vocabulary and binarizing data..." 
# Refuse to run without a test-set name: $prep would then be the corpus root
# and the rm below would delete every binarized test set under it.
: "${set:?test set name (first argument) is required}"
# Start from a clean output directory. ${prep:?} aborts the script if $prep
# is empty, so this can never expand to `rm -rf data-bin/`.
rm -rf -- "data-bin/${prep:?}"
# Binarize the test files with the existing dictionary found under
# data-bin/$CODE_SOURCE_DIR, shared by source and target (--joined-dictionary).
fairseq-preprocess \
    --source-lang "$src" \
    --target-lang "$tgt" \
    --testpref "$prep/test" \
    --srcdict "data-bin/$CODE_SOURCE_DIR/dict.en.txt" \
    --joined-dictionary \
    --destdir "data-bin/$prep" \
    --workers "$N_THREADS" || exit 1
# Ship the headline indices alongside the binarized data.
cp "$prep"/*.heads "data-bin/$prep/"