#!/usr/bin/env bash
# Tokenize and binarize the WMT14 newstest2014 en-fr test set.
#
# Usage:
#   bash processTransTest.sh DATADIR LG1 LG_TAG1 LG2 LG_TAG2 [MODE]
# Example:
#   bash processTransTest.sh /home/tiger/wmt14/test en en_XX fr fr_XX
#
# -e aborts on the first failing command. pipefail extends that to pipelines,
# so a failing encoder is not hidden by the exit status of the `sed` that
# consumes its output.
set -eo pipefail

# Every relative path used below (fairseq/...) is resolved from two levels
# above the directory the script is invoked from.
cd ../../

DATADIR="$1"   # raw data directory (required)
LG1="$2"       # first language code, e.g. en
LG_TAG1="$3"   # tag written in brackets in front of every source line, e.g. en_XX
LG2="$4"       # second language code, e.g. fr
LG_TAG2="$5"   # tag written in brackets in front of every target line, e.g. fr_XX
MODE="$6"      # tokenization mode (optional; a default is applied further down)

TASK_TAG="tran"

# The raw data directory is mandatory. Report the problem on stderr and exit
# non-zero: a bare `exit` would return the status of the preceding echo (0)
# and make the failure look like success to the caller.
if [[ -z "$DATADIR" ]]; then
    echo "Lose the raw data dir!" >&2
    exit 1
fi
# if [[ -n "$DIVISIONS" ]] && [[ "$DIVISIONS" == "test" ]]; then
#     PARTS=(test)
# else
#     # PARTS=(train dev test)
#     PARTS=(train dev)
# fi
# echo "Parts are : ${PARTS[*]}"

# Fall back to MSPM when no tokenization mode was supplied (unset or empty).
MODE="${MODE:-MSPM}"
printf 'Mode are : %s\n' "$MODE"

# Only referenced by the commented-out SPM training commands further down.
VOCAB_SIZE=32000

# The raw files are read from $DATADIR/cleaned. Fetch that directory from HDFS
# whenever it is missing; testing for $DATADIR alone would skip the download
# for good after a first run that was interrupted between mkdir and the copy.
DATA="$DATADIR/cleaned"
if [[ ! -d "$DATA" ]]; then
    mkdir -p "$DATADIR"
    hadoop fs -copyToLocal -f /home/byte_arnold_hl_mlnlc/user/wuxianze.0/Datasets/wmt14/cleaned "$DATADIR/"
fi

# NOTE(review): LG is never assigned in this script (only LG1/LG2 are), so the
# last path component is empty unless LG is inherited from the environment.
# Kept as-is so the output location does not move — confirm the intended layout.
TOKEN="$DATADIR/$MODE/${LG:-}"
mkdir -p "$TOKEN"

# Tokenize the raw test files according to $MODE.
# Only MSPM is implemented. The other modes keep their legacy commands as
# commented-out reference and abort, because carrying on would run the
# binarization step with an empty $DICT.

#######################################
# Encode one raw text file with the SentencePiece model in $MODEL and wrap
# every line as "[<tag>] <s> ... </s>".
# Globals:   MODEL (read)
# Arguments: $1 - raw input file
#            $2 - tokenized output file
#            $3 - language tag placed in brackets at the start of each line
# Outputs:   progress messages and the first two encoded lines on stdout
# Returns:   exits the script with status 1 if the input is missing or the
#            encode pipeline fails
#######################################
spm_encode_tagged() {
  local raw="$1" out="$2" lang_tag="$3"

  echo "MSPM encoding for dataset..."
  echo "  encoding $raw to $out ..."
  if [[ ! -f "$raw" ]]; then
    echo "Missing input file: $raw" >&2
    exit 1
  fi

  # pipefail is scoped to the subshell so that sed's zero status cannot hide
  # a failed encoder. The sed expressions, in order: turn "< q >" into
  # " </s>", prepend "<s> ", append " </s>", prepend the bracketed tag.
  (
    set -o pipefail
    python3 fairseq/scripts/spm_encode.py --model="$MODEL" < "$raw" \
      | sed -e "s/< q >/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${lang_tag}] /" \
      > "$out"
  ) || {
    echo "Encoding failed: $raw" >&2
    exit 1
  }

  echo "first 2 line..."
  head -n 2 "$out"
}

case "$MODE" in
  SPM)
    echo "Not supported" >&2
    exit 1
    # if [ ! -f "$DATA/sentence.bpe.$VOCAB_SIZE.$LG.model" ] ; then
    #   echo "SPM training for Language..."
    #   if [[ "$LG" == "zh" ]] || [[ "$LG" == "ja" ]] ; then
    #     python fairseq/scripts/spm_train.py \
    #     --input="$DATA/lines.txt" \
    #     --model_prefix="$DATA/sentence.bpe.$VOCAB_SIZE.$LG" \
    #     --vocab_size=$VOCAB_SIZE \
    #     --character_coverage=0.995 \
    #     --model_type=bpe \
    #     --user_defined_symbols='<q>'
    #   else
    #     python fairseq/scripts/spm_train.py \
    #     --input="$DATA/lines.txt" \
    #     --model_prefix="$DATA/sentence.bpe.$VOCAB_SIZE.$LG" \
    #     --vocab_size=$VOCAB_SIZE \
    #     --character_coverage=1 \
    #     --model_type=bpe \
    #     --user_defined_symbols='<q>'
    #   fi
    # fi

    # echo "SPM encoding for dataset..."
    # for SPLIT in ${PARTS[*]}
    #   do
    #     echo "  encoding $DATA/$SPLIT.$LG.doc to $TOKEN/$SPLIT.$LG.spm.doc ..."
    #     python fairseq/scripts/spm_encode.py --model="$DATA/sentence.bpe.$VOCAB_SIZE.$LG.model" < "$DATA/$SPLIT.$LG.doc" | sed "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${LG_TAG}] [${TASK_TAG}] /" > "$TOKEN/$SPLIT.$LG.spm.doc"
    #     echo "  encoding $DATA/$SPLIT.$LG.sum to $TOKEN/$SPLIT.$LG.spm.sum ..."
    #     python fairseq/scripts/spm_encode.py --model="$DATA/sentence.bpe.$VOCAB_SIZE.$LG.model" < "$DATA/$SPLIT.$LG.sum" | sed "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${LG_TAG}] [${TASK_TAG}] /"> "$TOKEN/$SPLIT.$LG.spm.sum"
    #   done

    # echo "Changing vocab to dict..."
    # python -m preprocess.vocab2dict -i "$DATA/sentence.bpe.$VOCAB_SIZE.$LG.vocab" -o "$DATA/$LG.dict.txt" -l "$LG"
    # DICT="$DATA/$LG.dict.txt"
    ;;

  MSPM)
    MBART=/home/tiger/mbart.cc25
    if [[ ! -d "$MBART" ]]; then
      # -f overwrites a tarball left behind by an earlier interrupted run,
      # matching the flag already used for the raw-data download.
      hadoop fs -copyToLocal -f hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/mbart.CC25.tar.gz /home/tiger
      tar -xvzf /home/tiger/mbart.CC25.tar.gz -C /home/tiger
    fi
    MODEL="$MBART/sentence.bpe.model"
    # DICT=$MBART/dict.txt
    # DICT=$MBART/dict_joint.txt
    DICT="$MBART/dict_extend.txt"

    # Source side (en), then reference side (fr).
    spm_encode_tagged "$DATA/newstest2014-fren-src.en" "$TOKEN/newstest2014-en2fr.en" "$LG_TAG1"
    spm_encode_tagged "$DATA/newstest2014-fren-ref.fr" "$TOKEN/newstest2014-en2fr.fr" "$LG_TAG2"
    ;;

  BPE)
    echo "Not supported" >&2
    exit 1
    # wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
    # wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
    # wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'

    # DICT="dict.txt"
    # for SPLIT in ${PARTS[*]}
    # do
    #   for TYPE in doc sum
    #   do
    #     python -m examples.roberta.multiprocessing_bpe_encoder \
    #       --encoder-json encoder.json \
    #       --vocab-bpe vocab.bpe \
    #       --inputs "$DATA/$SPLIT.$LG.$TYPE" \
    #       --outputs "$TOKEN/$SPLIT.$LG.bpe.$TYPE" \
    #       --workers 60 \
    #       --keep-empty
    #   done
    # done
    ;;

  PMNMT)
    echo "Not supported" >&2
    exit 1
    # # MODEL=toutiaomt/codes.bpe.32000
    # # DICT=toutiaomt/vocab.bpe.32000
    # LANGID="LANG_TOK_"$(echo "${LG}" | tr '[a-z]' '[A-Z]')
    # MODEL=~/mCOLT/vocab/codes.bpe.32000
    # DICT=~/mCOLT/vocab/dict.txt

    # echo "PMNMT BPE encoding for dataset..."
    # for SPLIT in ${PARTS[*]}
    #   do
    #     echo "  encoding $DATA/$SPLIT.$LG.doc to $TOKEN/$SPLIT.$LG.bpe.doc ..."
    #     python toutiaomt/text_processing/bpe.py --codes=$MODEL --input $DATA/$SPLIT.$LG.doc --output $TOKEN/$SPLIT.$LG.nonlang.bpe.doc --threads 32
    #     sed -e 's/^/'${LANGID}' /' -e "s/@@ <@@ q@@ >@@ /@@ <\/s> /g" -e "s/“@@/\"@@/g" -e "s/”@@/\"@@/g" $TOKEN/$SPLIT.$LG.nonlang.bpe.doc > $TOKEN/$SPLIT.$LG.bpe.doc
    #     echo "  encoding $DATA/$SPLIT.$LG.sum to $TOKEN/$SPLIT.$LG.bpe.sum ..."
    #     python toutiaomt/text_processing/bpe.py --codes=$MODEL --input $DATA/$SPLIT.$LG.sum --output $TOKEN/$SPLIT.$LG.nonlang.bpe.sum --threads 32
    #     sed -e 's/^/'${LANGID}' /' -e "s/@@ <@@ q@@ >@@ /@@ <\/s> /g" -e "s/“@@/\"@@/g" -e "s/”@@/\"@@/g" $TOKEN/$SPLIT.$LG.nonlang.bpe.sum > $TOKEN/$SPLIT.$LG.bpe.sum
    #   done
    ;;

  *)
    echo "Unknown mode: $MODE" >&2
    exit 1
    ;;
esac

# Binarize the tokenized test set with fairseq's preprocess.py, using the
# same dictionary for the source (en) and target (fr) side.
echo "Generating data-bin for dataset..."
INPUT="$TOKEN"
# NOTE(review): LG is not assigned anywhere in this script (only LG1/LG2), so
# the last path component is empty unless LG comes from the environment.
BINDIR="$DATADIR/data-bin/$MODE/$LG"

# TOKENTYPE is only used in the log line below.
case "$MODE" in
  SPM | MSPM) TOKENTYPE="spm" ;;
  *) TOKENTYPE="bpe" ;;
esac

echo "Binarized $INPUT ($TOKENTYPE) to $BINDIR with dict $DICT"

preprocess_args=(
    --source-lang "en"
    --target-lang "fr"
    --testpref "$INPUT/newstest2014-en2fr"
    --destdir "$BINDIR"
    --thresholdtgt 0
    --thresholdsrc 0
    --srcdict "$DICT"
    --tgtdict "$DICT"
    --workers 70
)
python3 fairseq/preprocess.py "${preprocess_args[@]}"
