#!/usr/bin/env bash
# Abort on the first failing command. pipefail makes a pipeline fail when ANY
# stage fails (not only the last one), so a failing encoder on the left-hand
# side of `... | sed ...` is no longer masked by a succeeding sed.
set -eo pipefail

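# Usage:   bash binary.sh <DATADIR> <LG> <MODE> [DIVISIONS]
#          MODE is one of SPM, MSPM, BPE, PMNMT; DIVISIONS may be "test".
# Example: bash binary.sh /home/tiger/summ en MSPM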

# Positional arguments (a missing argument yields the empty string):
#   $1 - raw data directory
#   $2 - language code
#   $3 - tokenization mode (SPM, MSPM, BPE and PMNMT are handled further down)
#   $4 - optional; the value "test" restricts processing to the test split
DATADIR=${1-}
LG=${2-}
MODE=${3-}
DIVISIONS=${4-}

# The relative paths used later (fairseq/..., python -m preprocess.*) are
# resolved from two levels above the caller's working directory.
cd ../..

# The data directory and the language are mandatory. Report the problem on
# stderr and exit non-zero so callers can detect the failure; a bare `exit`
# right after `echo` would propagate echo's status, i.e. success.
if [[ -z "$DATADIR" ]]; then
    echo "Lose the raw data dir!" >&2
    exit 1
fi
if [[ -z "$LG" ]]; then
    echo "Lose the language!" >&2
    exit 1
fi
# Select the dataset splits to process: only "test" when the fourth argument
# is exactly "test", otherwise all three splits.
case "$DIVISIONS" in
    test) PARTS=(test) ;;
    *)    PARTS=(train dev test) ;;
esac
echo "Parts are : ${PARTS[*]}"

# Directory of the tokenized text for this mode and language. It is not
# created here: the mkdir below is disabled.
TOKEN="${DATADIR}/${MODE}/${LG}"
# if [ ! -d "$TOKEN"  ] ; then
#     mkdir -p "$TOKEN"
# fi

# Reads encoded lines on stdin and writes each one to stdout wrapped as
# "[$TAG] <s> ... </s>". TAG is not assigned in this script and is read from
# the environment.
# Every expression carries its own -e: as soon as one -e is present, sed
# treats a bare first argument as an input FILE name, not as a script.
spm_wrap_lines() {
  sed -e 's/^/<s> /' -e 's/$/ <\/s>/' -e "s/^/[${TAG}] /"
}

# Mode-specific preparation. Each branch sets DICT (some also set MODEL) for
# the binarization step; an unrecognized MODE leaves them untouched.
# DATA and VOCAB_SIZE are not assigned in this script - presumably exported by
# the caller; TODO confirm.
case "$MODE" in
  SPM)
    spm_prefix="$DATA/sentence.bpe.$VOCAB_SIZE.$LG"

    # Train the model only when no model file exists yet.
    if [[ ! -f "${spm_prefix}.model" ]]; then
      echo "SPM training for Language..."
      # zh and ja are trained with a lower character coverage than the rest.
      case "$LG" in
        zh|ja) spm_coverage=0.995 ;;
        *)     spm_coverage=1 ;;
      esac
      python fairseq/scripts/spm_train.py \
        --input="$DATA/lines.txt" \
        --model_prefix="$spm_prefix" \
        --vocab_size="$VOCAB_SIZE" \
        --character_coverage="$spm_coverage" \
        --model_type=bpe \
        --user_defined_symbols='<q>'
    fi

    echo "SPM encoding for dataset..."
    # The output redirections below fail when the target directory is missing.
    mkdir -p "$TOKEN"
    for SPLIT in "${PARTS[@]}"; do
      # Documents first, then summaries, for every selected split.
      for spm_ext in doc sum; do
        echo "  encoding $DATA/$SPLIT.$LG.$spm_ext to $TOKEN/$SPLIT.$LG.spm.$spm_ext ..."
        python fairseq/scripts/spm_encode.py --model="${spm_prefix}.model" \
          < "$DATA/$SPLIT.$LG.$spm_ext" \
          | spm_wrap_lines \
          > "$TOKEN/$SPLIT.$LG.spm.$spm_ext"
      done
    done

    echo "Changing vocab to dict..."
    python -m preprocess.vocab2dict -i "${spm_prefix}.vocab" -o "$DATA/$LG.dict.txt" -l "$LG"
    DICT="$DATA/$LG.dict.txt"
    ;;

  MSPM)
    # MBART=/data00/home/wangdanqing.122/workshop/MultiLingual/train/pretrain/mbart/mbart.cc25
    MBART=/home/tiger/mbart.cc25
    # Fetch and unpack the archive only when the directory is not there yet.
    if [[ ! -d "$MBART" ]]; then
      hadoop fs -copyToLocal hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/mbart.CC25.tar.gz /home/tiger
      tar -xvzf /home/tiger/mbart.CC25.tar.gz -C /home/tiger
    fi
    MODEL="$MBART/sentence.bpe.model"
    DICT="$MBART/dict_extend_extra2.txt"
    ;;

  BPE)
    # -N: download only when the remote file is newer than the local copy.
    wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
    wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
    wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'

    DICT="dict.txt"
    ;;

  PMNMT)
    # MODEL=toutiaomt/codes.bpe.32000
    # DICT=toutiaomt/vocab.bpe.32000
    # Language tag: "LANG_TOK_" followed by the upper-cased language code.
    LANGID="LANG_TOK_$(printf '%s' "$LG" | tr 'a-z' 'A-Z')"
    MODEL=~/mCOLT/vocab/codes.bpe.32000
    DICT=~/mCOLT/vocab/dict.txt
    ;;
esac

echo "Generating data-bin for dataset..."
# INPUT is the tokenized-text directory; BINDIR receives the binarized output.
INPUT="$TOKEN"
BINDIR="$DATADIR/data-bin/$MODE/$LG"
# File-name infix of the tokenized files: "spm" for the two SPM modes, "bpe"
# for every other mode.
case "$MODE" in
  SPM|MSPM) TOKENTYPE="spm" ;;
  *)        TOKENTYPE="bpe" ;;
esac

echo "Binarized $INPUT ($TOKENTYPE) to $BINDIR with dict $DICT"

# Print the values verbatim: quoted, so they are neither word-split nor
# glob-expanded, and never mistaken for echo options.
printf '%s\n' "$DICT"
printf '%s\n' "$MODE"

# Binarize the tokenized doc -> sum pairs with one dictionary shared by the
# source and the target side.
# NOTE(review): the train and dev prefixes are always passed, even when only
# the test split was selected, and no test prefix is given - confirm intent.
# NOTE(review): the SPM branch writes <split>.<LG>.spm.{doc,sum}, whereas the
# prefixes here carry no language component - verify the expected file names.
preprocess_args=(
    --source-lang doc
    --target-lang sum
    --trainpref "$INPUT/train.$TOKENTYPE"
    --validpref "$INPUT/dev.$TOKENTYPE"
    --destdir "$BINDIR"
    --srcdict "$DICT"
    --tgtdict "$DICT"
    --workers 70
)
python3 fairseq/preprocess.py "${preprocess_args[@]}"
