#!/usr/bin/env bash
set -e

# Binarize a tokenized dataset with fairseq/preprocess.py.
#
# Usage:
#   bash binary_shuffled_data.sh DATADIR LG MODE [DIVISIONS]
# Example:
#   bash binary_shuffled_data.sh /home/tiger/Datasets/multilingual lang5 MSPM
#
# Arguments:
#   DATADIR    root directory of the data (required); tokenized input is read
#              from DATADIR/MODE/LG and output goes to DATADIR/data-bin/MODE/LG
#   LG         language identifier used in paths and file names (required)
#   MODE       tokenization mode; "SPM" and "MSPM" select *.spm files, any
#              other value selects *.bpe files (not validated here)
#   DIVISIONS  optional; "test" binarizes only the test split, any other
#              value (or none) binarizes train and dev

DATADIR="$1"
LG="$2"
MODE="$3"
DIVISIONS="$4"

# Fail with a non-zero status: a bare `exit` would return the status of the
# preceding echo (0) and make a missing argument look like success.
if [[ -z "$DATADIR" ]]; then
    echo "Lose the raw data dir!" >&2
    exit 1
fi
if [[ -z "$LG" ]]; then
    echo "Lose the language!" >&2
    exit 1
fi
# Choose which dataset splits to binarize: only "test" when DIVISIONS is
# exactly "test", otherwise train and dev.
# (A three-way variant, PARTS=(train dev test), is left disabled.)
case "$DIVISIONS" in
  test)
    PARTS=(test)
    ;;
  *)
    PARTS=(train dev)
    ;;
esac
echo "Parts are : ${PARTS[*]}"

# The preprocess script is invoked later via a relative path, so switch to
# the code checkout first.
cd /opt/tiger/code/multilingual

echo "Generating data-bin for dataset..."

# Tokenized input lives under DATADIR/MODE/LG; binarized output goes to the
# parallel location under DATADIR/data-bin.
TOKEN="${DATADIR}/${MODE}/${LG}"
INPUT="${TOKEN}"
BINDIR="${DATADIR}/data-bin/${MODE}/${LG}"

# File-name suffix of the tokenized files: "spm" for the SPM and MSPM modes,
# "bpe" for every other mode.
case "$MODE" in
  SPM | MSPM)
    TOKENTYPE="spm"
    ;;
  *)
    TOKENTYPE="bpe"
    ;;
esac

# Dictionary used for both the source and target side.
MBART="/home/tiger/mbart.cc25"
DICT="${MBART}/dict_extend.txt"

echo "Binarized $INPUT ($TOKENTYPE) to $BINDIR with dict $DICT"

# Run fairseq's preprocess script with doc -> sum as the language pair.
# The argument list is assembled from three pieces so that both variants
# share the leading and trailing options while keeping the same order.
preprocess_head=(--source-lang doc --target-lang sum)
preprocess_tail=(--srcdict "$DICT" --tgtdict "$DICT" --workers 70)

if (( ${#PARTS[@]} == 2 )); then
  # Two parts selected: binarize the train and dev files.
  preprocess_split=(
    --trainpref "$INPUT/train.$LG.$TOKENTYPE"
    --validpref "$INPUT/dev.$LG.$TOKENTYPE"
    --destdir "$BINDIR"
  )
else
  # Otherwise binarize the test file only, with both thresholds set to 0.
  preprocess_split=(
    --testpref "$INPUT/test.$LG.$TOKENTYPE"
    --destdir "$BINDIR"
    --thresholdtgt 0
    --thresholdsrc 0
  )
fi

python3 fairseq/preprocess.py \
  "${preprocess_head[@]}" \
  "${preprocess_split[@]}" \
  "${preprocess_tail[@]}"