#!/usr/bin/env bash
set -e

# Usage:   bash binary.sh <DATADIR> <LG> <MODE> [DIVISIONS]
# Example: bash binary.sh /home/tiger/summ en MSPM

# Positional arguments:
#   $1 DATADIR   - data root (required); tokenized input is read from
#                  $DATADIR/$MODE and output is written under $DATADIR/data-bin
#   $2 LG        - language code, e.g. "en" (required)
#   $3 MODE      - tokenization mode; MSPM, BPE and PMNMT have preset
#                  dictionaries further down
#   $4 DIVISIONS - accepted, but not referenced anywhere else in this script
DATADIR="$1"
LG="$2"
MODE="$3"
DIVISIONS="$4"

# Validate the required arguments first, report on stderr and exit non-zero
# so that a calling pipeline can detect the failure.
if [[ -z "$DATADIR" ]]; then
    echo "Lose the raw data dir!" >&2
    exit 1
fi
if [[ -z "$LG" ]]; then
    echo "Lose the language!" >&2
    exit 1
fi

# Later relative paths (fairseq/preprocess.py, the downloaded BPE files) are
# resolved from two levels above the directory the script is invoked from.
cd ../.. || exit 1

# Directory holding the tokenized input for the selected mode.
# TOKEN="$DATADIR/$MODE/$LG"
TOKEN="$DATADIR/$MODE"

# Choose the tokenizer model and dictionary for the requested mode.
# Any mode not listed here leaves MODEL and DICT as they were (normally unset).
case "$MODE" in
  MSPM)
    # MBART=/data00/home/wangdanqing.122/workshop/MultiLingual/train/pretrain/mbart/mbart.cc25
    MBART=/home/tiger/mbart.cc25
    # Fetch and unpack the checkpoint only when the directory is not there yet.
    if [[ ! -d "$MBART" ]]; then
      hadoop fs -copyToLocal hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/mbart.CC25.tar.gz /home/tiger
      tar -xvzf /home/tiger/mbart.CC25.tar.gz -C /home/tiger
    fi
    MODEL="$MBART/sentence.bpe.model"
    DICT="$MBART/dict_extend.txt"
    ;;
  BPE)
    # wget -N re-downloads only when the remote file is newer than the local
    # copy; the files land in the current working directory.
    wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
    wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
    wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'

    DICT="dict.txt"
    ;;
  PMNMT)
    # MODEL=toutiaomt/codes.bpe.32000
    # DICT=toutiaomt/vocab.bpe.32000
    # Upper-cased language tag, e.g. "en" -> LANG_TOK_EN.
    # LANGID is not referenced anywhere else in this script.
    LANGID="LANG_TOK_$(printf '%s\n' "$LG" | tr '[:lower:]' '[:upper:]')"
    MODEL=~/mCOLT/vocab/codes.bpe.32000
    DICT=~/mCOLT/vocab/dict.txt
    ;;
esac

# Binarize the tokenized files for language $LG with fairseq's preprocess
# script, writing the result to $DATADIR/data-bin/$MODE/$LG.
echo "Generating data-bin for dataset..."
INPUT="$TOKEN"
BINDIR="$DATADIR/data-bin/$MODE/$LG"

# SPM and MSPM inputs carry the ".spm" suffix; every other mode uses ".bpe".
case "$MODE" in
  SPM | MSPM) TOKENTYPE="spm" ;;
  *) TOKENTYPE="bpe" ;;
esac

echo "Binarized $INPUT ($TOKENTYPE) to $BINDIR with dict $DICT"

# Quoted so that an empty or whitespace-containing value is printed verbatim.
printf '%s\n' "$DICT"
printf '%s\n' "$MODE"

# NOTE(review): --trainpref points at the *dev* split, identical to
# --validpref. Confirm this is intentional (e.g. evaluation-only data) and not
# a leftover; it is kept unchanged here.
# The same dictionary is used for both the source (doc) and target (sum) side.
python3 fairseq/preprocess.py \
    --source-lang doc \
    --target-lang sum \
    --trainpref "$INPUT/$LG.dev.$TOKENTYPE" \
    --validpref "$INPUT/$LG.dev.$TOKENTYPE" \
    --destdir "$BINDIR" \
    --srcdict "$DICT" \
    --tgtdict "$DICT" \
    --workers 70
