#!/usr/bin/env bash
set -e

# Tokenize the summarization dataset for one language.
#
# Usage:
#   bash tokenizeSumm.sh DATADIR LG LG_TAG [MODE] [DIVISIONS]
# Example:
#   bash tokenizeSumm.sh /home/tiger/summ en en_XX
#
# Arguments:
#   DATADIR    directory that holds (or will receive) the clean0702 dataset;
#              required
#   LG         language code used in the data file names; required
#   LG_TAG     language tag written in square brackets at the start of every
#              encoded line; it is interpolated into a sed replacement, so it
#              must not contain "/" or "&"
#   MODE       tokenization mode; defaults to MSPM when omitted or empty
#   DIVISIONS  pass "test" to process only the test split; any other value
#              (or none) processes train, dev and test

# Relative paths used later (fairseq/scripts/spm_encode.py, and DATADIR if it
# is given as a relative path) are resolved from two levels above the
# directory the script is started in.
cd ../..

DATADIR="$1"
LG="$2"
LG_TAG="$3"
MODE="${4:-MSPM}"
DIVISIONS="$5"

# Task marker emitted right after the language tag on each encoded line.
TASK_TAG="summ"

# A missing required argument is a usage error: report it on stderr and exit
# non-zero so that calling scripts do not mistake it for success.
if [[ -z "$DATADIR" ]]; then
    echo "Lose the raw data dir!" >&2
    exit 1
fi
if [[ -z "$LG" ]]; then
    echo "Lose the language!" >&2
    exit 1
fi

# Only the literal value "test" narrows the run to a single split.
if [[ "$DIVISIONS" == "test" ]]; then
    PARTS=(test)
else
    PARTS=(train dev test)
fi
echo "Parts are : ${PARTS[*]}"
echo "Mode are : ${MODE}"

# Fetch the raw dataset from HDFS unless $DATADIR/clean0702 already exists.
if [[ ! -e "$DATADIR/clean0702" ]]; then
    hadoop_data=hdfs://haruna/home/byte_arnold_lq_mlnlc/user/wangdanqing.122/Datasets/multilingual/clean0702
    echo "download summarization dataset from $hadoop_data to ${DATADIR}/"
    # Create the destination first so the dataset is copied *into* it as
    # $DATADIR/clean0702.
    # NOTE(review): with a non-existent destination, `hdfs dfs -get` is
    # expected to copy the source directory *as* the destination instead --
    # confirm against the Hadoop FileSystem shell documentation.
    mkdir -p "$DATADIR"
    # Quoted so that a DATADIR containing spaces or glob characters is passed
    # as a single argument.
    hdfs dfs -get "$hadoop_data" "$DATADIR/"
fi

# Derived locations:
#   DATA   raw per-language files inside the downloaded dataset
#   TOKEN  per-mode, per-language output directory for the encoded files
# NOTE(review): VOCAB_SIZE is not referenced anywhere else in the visible
# script -- confirm whether it is still needed.
VOCAB_SIZE=32000
TOKEN="$DATADIR/$MODE/$LG"
DATA="$DATADIR/clean0702/$LG"

# Create the output directory only when it is not there yet.
[[ -d "$TOKEN" ]] || mkdir -p "$TOKEN"

# Encode every requested split according to $MODE. Only MSPM (sentencepiece
# model shipped with mbart.cc25) is implemented; SPM, BPE and PMNMT are
# recognised but unsupported, and anything else is rejected as unknown.
case "$MODE" in
MSPM)
    MBART=/home/tiger/mbart.cc25
    # Download and unpack the pretrained model archive on first use.
    if [[ ! -d "$MBART" ]]; then
        hadoop fs -copyToLocal hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/mbart.CC25.tar.gz /home/tiger
        tar -xvzf /home/tiger/mbart.CC25.tar.gz -C /home/tiger
    fi
    MODEL="$MBART/sentence.bpe.model"
    # NOTE(review): DICT is assigned but not used in the visible script.
    DICT="$MBART/dict_extend_mask.txt"

    # Without pipefail the status of "encoder | sed" is sed's alone, so a
    # missing input file or a crashing encoder would silently leave an empty
    # or truncated output file and the script would carry on.
    set -o pipefail

    echo "MSPM encoding for dataset..."
    for SPLIT in "${PARTS[@]}"; do
        # Documents (.doc) and summaries (.sum) are encoded the same way.
        for KIND in doc sum; do
            input="$DATA/$SPLIT.$LG.$KIND"
            output="$TOKEN/$SPLIT.$LG.spm.$KIND"
            echo " encoding $input to $output ..."
            # sed steps, in order: turn each "< q >" marker into " </s>",
            # wrap the line in "<s> ... </s>", then prefix the
            # "[LG_TAG] [TASK_TAG] " header.
            python3 fairseq/scripts/spm_encode.py --model="$MODEL" < "$input" \
                | sed -e "s/< q >/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${LG_TAG}] [$TASK_TAG] /" > "$output"
            echo "first line..."
            head -n 1 "$output"
        done
    done
    ;;
SPM | BPE | PMNMT)
    # Nothing was produced, so do not report success to the caller.
    echo "Not supported" >&2
    exit 1
    ;;
*)
    echo "Unknown mode: $MODE" >&2
    exit 1
    ;;
esac

