#!/usr/bin/env bash

# Tokenize the data and remove instances whose source or target sequence length exceeds ${MAXLEN}.

# Example: bash tokenize_and_filter.sh /home/tiger/wmt14 en en_XX MSPM 1020

# Abort on the first failing command. pipefail additionally makes a pipeline
# fail when any stage fails, so a crashing spm_encode.py is no longer hidden
# by the exit status of the sed stage that follows it.
set -e
set -o pipefail

# Directory holding helper scripts (filterByMaxSize.py is run from here).
SCRIPTDIR=/opt/tiger/sumtest/multilingual/utils
# Relative paths used later (fairseq/scripts/spm_encode.py) resolve from here.
cd /opt/tiger/sumtest/multilingual

# Positional arguments:
#   $1 DATADIR  root directory of the raw data
#   $2 LG       language code, used in directory and file names
#   $3 LG_TAG   tag written in brackets at the start of every encoded line
#   $4 MODE     tokenization mode; an empty value is defaulted further down
#   $5 MAXLEN   maximum source/target sequence length (default: 1020)
DATADIR="$1"
LG="$2"
LG_TAG="$3"
MODE="$4"
MAXLEN="${5:-1020}"

# Dataset splits processed by the encoding loop.
SPLITS=(train dev)

# Validate the two mandatory arguments. A bare `exit` would return the status
# of the preceding echo (0), making a usage error look like success to the
# caller, so exit with an explicit non-zero status and report on stderr.
if [[ -z "$DATADIR" ]]; then
    echo "Lose the raw data dir!" >&2
    exit 1
fi
if [[ -z "$LG" ]]; then
    echo "Lose the language!" >&2
    exit 1
fi

# Use MSPM when no mode was supplied (unset or empty), then report the mode
# that is in effect.
MODE="${MODE:-MSPM}"
echo "Mode are : ${MODE}"

# Vocabulary size. NOTE(review): not referenced anywhere else in the visible
# script — confirm whether it is still needed.
VOCAB_SIZE=32000

# First use of DATADIR: create it and pull the dataset from HDFS.
# Expansions are quoted so paths containing spaces work, and -p allows
# DATADIR to be a nested path whose parents do not exist yet.
if [[ ! -d "$DATADIR" ]]; then
    mkdir -p -- "$DATADIR"
    if ! hadoop fs -copyToLocal -f /home/byte_arnold_hl_mlnlc/user/wuxianze.0/Datasets/wmt14/cleaned "$DATADIR/"; then
        echo "Failed to download the dataset into $DATADIR" >&2
        # Best effort: drop the directory if it is still empty, otherwise the
        # existence check above would skip the download on every later run.
        rmdir -- "$DATADIR" 2>/dev/null || true
        exit 1
    fi
fi

# DATA:  per-language input directory read by the encoding step.
# TOKEN: per-mode, per-language output directory.
DATA="$DATADIR/${LG}"
TOKEN="$DATADIR/$MODE/${LG}"
# mkdir -p is a no-op when the directory already exists.
mkdir -p -- "$TOKEN"

#######################################
# Encode one text file with spm_encode.py and post-process each line with sed:
#   "< q >" becomes " </s>", "<s> " is prepended, " </s>" is appended, and
#   finally "[${LG_TAG}] " is prepended. Any extra sed arguments are applied
#   after these common rewrites. Prints the first two lines of the result.
# Globals:   LG_TAG (read)
# Arguments: $1  - SentencePiece model path
#            $2  - input file
#            $3  - output file
#            $4+ - additional sed arguments (e.g. -e 'expr')
# Outputs:   progress messages and a two-line preview on stdout
#######################################
encode_with_spm() {
  local model="$1" input="$2" output="$3"
  shift 3
  echo "  encoding $input to $output ..."
  python3 fairseq/scripts/spm_encode.py --model="$model" \
      < "$input" \
    | sed -e "s/< q >/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" \
          -e "s/^/[${LG_TAG}] /" "$@" \
      > "$output"
  echo "first 2 line..."
  head -n 2 "$output"
}

# Dispatch on the tokenization mode. Only MSPM is implemented; the other
# known modes and any unknown mode now fail with a non-zero status instead of
# finishing "successfully" without producing any output.
case "$MODE" in
  MSPM)
    # Download and unpack the pretrained mBART archive on first use.
    MBART=/home/tiger/mbart.cc25
    if [[ ! -d "$MBART" ]]; then
      hadoop fs -copyToLocal hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/mbart.CC25.tar.gz /home/tiger
      tar -xvzf /home/tiger/mbart.CC25.tar.gz -C /home/tiger
    fi
    MODEL="$MBART/sentence.bpe.model"

    # Honour the MAXLEN argument; 1020 was previously hard-coded and remains
    # the fallback when MAXLEN is empty.
    max_len="${MAXLEN:-1020}"

    echo "MSPM encoding for dataset..."
    # Loop-invariant, so create the intermediate directory once.
    mkdir -p -- "$TOKEN/unclean"
    for split in "${SPLITS[@]}"; do
      doc_out="$TOKEN/unclean/$split.$LG.spm.doc"
      sum_out="$TOKEN/unclean/$split.$LG.spm.sum"

      # NOTE(review): the [mask] clean-up expressions for .doc and .sum differ
      # in their handling of trailing spaces. They are reproduced exactly as
      # they were; confirm whether the difference is intentional.
      encode_with_spm "$MODEL" "$DATA/$split.$LG.doc" "$doc_out" \
          -e "s/\[ mas k \]/[mask] /g" -e "s/▁\[mask\]/[mask]/g"
      encode_with_spm "$MODEL" "$DATA/$split.$LG.sum" "$sum_out" \
          -e "s/\[ mas k \] /[mask] /g" -e "s/▁\[mask\]/[mask] /g"

      echo "remove long sequence, MAXLEN=${max_len} ... "
      python3 "${SCRIPTDIR}/filterByMaxSize.py" "$doc_out" "$sum_out" "$TOKEN" "$max_len"
    done
    ;;
  SPM|BPE|PMNMT)
    echo "Not supported" >&2
    exit 1
    ;;
  *)
    echo "Unknown mode: ${MODE}" >&2
    exit 1
    ;;
esac
