#!/usr/bin/env bash
# -e          : abort on the first command that fails.
# -o pipefail : a pipeline fails when ANY stage fails. The encoding steps pipe
#               spm_encode.py into sed; without pipefail a crash in the encoder
#               would be hidden by sed exiting 0 and the script would go on to
#               binarize empty/truncated files.
set -eo pipefail

# Example: bash predata_nlpcc.sh /home/tiger/nlpcc MSPM
# Adds a language tag in front of every source and target line.

# Positional arguments:
#   $1 DATADIR   - root data directory (required); DATA is set to $DATADIR/clean
#   $2 MODE      - tokenization mode; selects the output sub-directory name
#   $3 DIVISIONS - optional; the literal value "test" restricts PARTS to the
#                  test split, anything else (or nothing) selects train/dev/test
DATADIR="$1"
MODE="$2"
DIVISIONS="$3"

DATAVER="clean_lang_tag"

# Validate before touching the filesystem. Report on stderr and exit non-zero
# so that callers can detect the failure (a bare `exit` would return the status
# of the preceding echo, i.e. 0).
if [[ -z "$DATADIR" ]]; then
    echo "Lose the raw data dir!" >&2
    echo "Usage: $0 DATADIR MODE [test]" >&2
    exit 1
fi

# All relative paths used later (fairseq/...) are resolved from this directory.
cd "/opt/tiger/sumtest/multilingual" || exit 1

if [[ "$DIVISIONS" == "test" ]]; then
    PARTS=(test)
else
    PARTS=(train dev test)
fi
echo "Parts are : ${PARTS[*]}"

# Only referenced by the commented-out SPM training section of this script.
VOCAB_SIZE=32000
DATA="$DATADIR/clean"
TOKEN="$DATADIR/$DATAVER/$MODE"

# mkdir -p is a no-op when the directory already exists.
mkdir -p "$TOKEN"

# if [[ "$MODE" == "SPM" ]]; then
#   if [ ! -f "$DATA/sentence.bpe.$VOCAB_SIZE.$LG.model" ] ; then
#     echo "SPM training for Language..."
#     if [[ "$LG" == "zh" ]] || [[ "$LG" == "ja" ]] ; then
#       python3 fairseq/scripts/spm_train.py \
#       --input="$DATA/lines.txt" \
#       --model_prefix="$DATA/sentence.bpe.$VOCAB_SIZE.$LG" \
#       --vocab_size=$VOCAB_SIZE \
#       --character_coverage=0.995 \
#       --model_type=bpe \
#       --user_defined_symbols='<q>'
#     else
#       python3 fairseq/scripts/spm_train.py \
#       --input="$DATA/lines.txt" \
#       --model_prefix="$DATA/sentence.bpe.$VOCAB_SIZE.$LG" \
#       --vocab_size=$VOCAB_SIZE \
#       --character_coverage=1 \
#       --model_type=bpe \
#       --user_defined_symbols='<q>'
#     fi
#   fi

#   echo "SPM encoding for dataset..."
#   for SPLIT in ${PARTS[*]}
#     do
#       echo "  encoding $DATA/$SPLIT.$LG.doc to $TOKEN/$SPLIT.$LG.spm.doc ..."
#       python3 fairseq/scripts/spm_encode.py --model="$DATA/sentence.bpe.$VOCAB_SIZE.$LG.model" < "$DATA/$SPLIT.$LG.doc" | sed "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${TAG}] /" > "$TOKEN/$SPLIT.$LG.spm.doc"
#       echo "  encoding $DATA/$SPLIT.$LG.sum to $TOKEN/$SPLIT.$LG.spm.sum ..."
#       python3 fairseq/scripts/spm_encode.py --model="$DATA/sentence.bpe.$VOCAB_SIZE.$LG.model" < "$DATA/$SPLIT.$LG.sum" | sed "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${TAG}] /"> "$TOKEN/$SPLIT.$LG.spm.sum"
#     done

#   echo "Changing vocab to dict..."
#   python3 -m preprocess.vocab2dict -i "$DATA/sentence.bpe.$VOCAB_SIZE.$LG.vocab" -o "$DATA/$LG.dict.txt" -l "$LG"
#   DICT="$DATA/$LG.dict.txt"


#######################################
# Filter stdin to stdout: replace every "< q >" with " </s>" and wrap each
# line as "[<tag>] <s> ... </s>".
# Arguments: $1 - language tag placed inside the leading "[...]"
#######################################
wrap_with_lang_tag() {
  sed -e "s/< q >/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[$1] /"
}

#######################################
# Encode one text file with fairseq/scripts/spm_encode.py and tag every line.
# Globals:   MODEL (read), TAG (read)
# Arguments: $1 - input text file
#            $2 - output file (overwritten)
# Returns:   non-zero if either the encoder or the tagging filter fails
#######################################
spm_encode_tagged() {
  local src="$1" dst="$2"
  echo "  encoding $src to $dst ..."
  (
    # pipefail is enabled in a subshell so that an encoder failure is not
    # masked by the filter succeeding, without changing the caller's options.
    set -o pipefail
    python3 fairseq/scripts/spm_encode.py --model="$MODEL" < "$src" \
      | wrap_with_lang_tag "$TAG" > "$dst"
  )
}

if [[ "$MODE" == "MSPM" ]]; then
  pretrained_path=hdfs://haruna/home/byte_arnold_lq_mlnlc/user/wangdanqing.122/Workshop/MultiLingual/pretrained
  MBART=/home/tiger/mbart.cc25
  DICT=$MBART/dict_extend.txt
  MODEL=$MBART/mbart.cc25/sentence.bpe.model

  # Key the download on the model file itself rather than on the directory:
  # the directory is created before the download, so an interrupted run would
  # otherwise leave an empty directory that makes later runs skip the download.
  if [[ ! -e "$MODEL" ]]; then
    mkdir -p "$MBART"
    if [[ ! -e "$MBART/mbart.CC25.tar.gz" ]]; then
      echo "Load pretrained model from ${pretrained_path}/mbart.CC25.tar.gz to ${MBART}" >&2
      hadoop fs -copyToLocal "${pretrained_path}/mbart.CC25.tar.gz" "$MBART"
    fi
    tar -xvzf "$MBART/mbart.CC25.tar.gz" -C "$MBART"
  fi

  # Checked independently of the model so that a missing dictionary is fetched
  # even when the model directory already exists.
  if [[ ! -e "$DICT" ]]; then
    echo "Load dict from ${pretrained_path}/dict_extend.txt to ${DICT}" >&2
    hadoop fs -copyToLocal "${pretrained_path}/dict_extend.txt" "$DICT"
  fi

  echo "MSPM encoding for dataset..."
  TAG=zh_CN
  for SPLIT in "${PARTS[@]}"; do
    # Documents (.doc) and summaries (.sum) get identical treatment.
    for suffix in doc sum; do
      spm_encode_tagged "$DATA/$SPLIT.$suffix" "$TOKEN/$SPLIT.spm.$suffix"
    done
  done
fi


# elif [[ "$MODE" == "BPE" ]]; then
#   wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
#   wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
#   wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'

#   DICT="dict.txt"
#   for SPLIT in ${PARTS[*]}
#   do
#     for TYPE in doc sum
#     do
#       python3 -m examples.roberta.multiprocessing_bpe_encoder \
#         --encoder-json encoder.json \
#         --vocab-bpe vocab.bpe \
#         --inputs "$DATA/$SPLIT.$LG.$TYPE" \
#         --outputs "$TOKEN/$SPLIT.$LG.bpe.$TYPE" \
#         --workers 60 \
#         --keep-empty
#     done
#   done

# elif [[ "$MODE" == "PMNMT" ]]; then
#   # MODEL=toutiaomt/codes.bpe.32000
#   # DICT=toutiaomt/vocab.bpe.32000
#   LANGID="LANG_TOK_"$(echo "${LG}" | tr '[a-z]' '[A-Z]')
#   MODEL=~/mCOLT/vocab/codes.bpe.32000
#   DICT=~/mCOLT/vocab/dict.txt

#   echo "PMNMT BPE encoding for dataset..."
#   for SPLIT in ${PARTS[*]}
#     do
#       echo "  encoding $DATA/$SPLIT.$LG.doc to $TOKEN/$SPLIT.$LG.bpe.doc ..."
#       python3 toutiaomt/text_processing/bpe.py --codes=$MODEL --input $DATA/$SPLIT.$LG.doc --output $TOKEN/$SPLIT.$LG.nonlang.bpe.doc --threads 32
#       sed -e 's/^/'${LANGID}' /' -e "s/@@ <@@ q@@ >@@ /@@ <\/s> /g" -e "s/“@@/\"@@/g" -e "s/”@@/\"@@/g" $TOKEN/$SPLIT.$LG.nonlang.bpe.doc > $TOKEN/$SPLIT.$LG.bpe.doc
#       echo "  encoding $DATA/$SPLIT.$LG.sum to $TOKEN/$SPLIT.$LG.bpe.sum ..."
#       python3 toutiaomt/text_processing/bpe.py --codes=$MODEL --input $DATA/$SPLIT.$LG.sum --output $TOKEN/$SPLIT.$LG.nonlang.bpe.sum --threads 32
#       sed -e 's/^/'${LANGID}' /' -e "s/@@ <@@ q@@ >@@ /@@ <\/s> /g" -e "s/“@@/\"@@/g" -e "s/”@@/\"@@/g" $TOKEN/$SPLIT.$LG.nonlang.bpe.sum > $TOKEN/$SPLIT.$LG.bpe.sum
#     done

# fi


# Binarize the tokenized files with fairseq/preprocess.py.
# Reads TOKEN, DATADIR, DATAVER, MODE, DICT and PARTS set earlier in the script.
echo "Generating data-bin for dataset..."
INPUT="$TOKEN"
BINDIR="$DATADIR/$DATAVER/data-bin/$MODE"

# Tokenized files are named <split>.<TOKENTYPE>.{doc,sum}.
case "$MODE" in
  SPM | MSPM) TOKENTYPE="spm" ;;
  *) TOKENTYPE="bpe" ;;
esac

echo "Binarized $INPUT ($TOKENTYPE) to $BINDIR with dict $DICT"

# Assemble the argument list once; only the middle section depends on which
# splits are being processed.
preprocess_args=(--source-lang doc --target-lang sum)
if (( ${#PARTS[@]} == 3 )); then
  # Full run: train, dev and test prefixes.
  preprocess_args+=(
    --trainpref "$INPUT/train.$TOKENTYPE"
    --validpref "$INPUT/dev.$TOKENTYPE"
    --testpref "$INPUT/test.$TOKENTYPE"
    --destdir "$BINDIR"
  )
else
  # Test-only run. NOTE(review): the zero thresholds presumably disable
  # frequency-based filtering of the dictionary - confirm against fairseq.
  preprocess_args+=(
    --testpref "$INPUT/test.$TOKENTYPE"
    --destdir "$BINDIR"
    --thresholdtgt 0
    --thresholdsrc 0
  )
fi
# Source and target share the same dictionary.
preprocess_args+=(--srcdict "$DICT" --tgtdict "$DICT" --workers 70)

python3 fairseq/preprocess.py "${preprocess_args[@]}"
