#!/usr/bin/env bash
# Tokenizes and binarizes the merged "en_zh_mono_retran" dataset built from the
# raw English files and their translated Chinese counterparts.
#
# Usage:   bash xqg_predata_trans_mono.sh <raw_data_dir> <mode> [divisions]
# Example: bash xqg_predata_trans_mono.sh /mnt/bd/lab-wxz/clt/xqg/raw MSPM
#
#   raw_data_dir  directory containing the raw data files (required)
#   mode          tokenization mode; also used as a path component of the
#                 output directories ("MSPM" enables sentencepiece encoding)
#   divisions     "test" selects the test split; any other value (or none)
#                 selects the train split
set -e

# Relative paths used later (fairseq/...) are resolved from this directory.
cd /opt/tiger/sumtest/crossLingualTransfer

DATADIR="$1"
MODE="$2"
DIVISIONS="$3"

MERGED_NAME="en_zh_mono_retran"
SUMM_LANG="en"
TRAN_LANG="zh"
TRANDIR="/mnt/bd/lab-wxz/clt/xqg/translated"

# Language code -> tag that is prepended (in brackets) to every encoded line.
declare -A LANG_TAGS=([en]="en_XX" [fr]="fr_XX" [zh]="zh_CN")

# A missing data dir is a usage error: report it on stderr and exit non-zero
# so that callers can detect the failure.
if [[ -z "$DATADIR" ]]; then
    echo "Lose the raw data dir!" >&2
    exit 1
fi

# Exactly one split is processed per run.
if [[ "$DIVISIONS" == "test" ]]; then
    PARTS=(test)
else
    PARTS=(train)
fi
echo "Parts are : ${PARTS[*]}"

# Output directory for the tokenized files; mkdir -p is a no-op if it exists.
TOKEN="$DATADIR/$MODE/${MERGED_NAME}"
mkdir -p "$TOKEN"

# Reads encoded lines on stdin and writes them to stdout in the form
# "[<tag>] <s> ... </s>", replacing every "< q >" marker with " </s>".
#   $1 - language tag placed in brackets at the start of each line
add_lang_markup() {
    local tag="$1"
    sed -e 's/< q >/ <\/s>/g' \
        -e 's/^/<s> /' \
        -e 's/$/ <\/s>/' \
        -e "s/^/[${tag}] /"
}

# MSPM mode: sentencepiece-encode the q/a/e fields of the raw (SUMM_LANG) and
# translated (TRAN_LANG) data, then merge both languages into one tagged
# .doc / .sum pair per split.
if [[ "$MODE" == "MSPM" ]]; then
    MBART=/home/tiger/mbart.cc25.v2
    MODEL="$MBART/sentence.bpe.model"
    DICT="$MBART/dict_extend.txt"

    # Directory for the encoded translated files (same for every split).
    TRANTOKEN="$TRANDIR/$MODE/${MERGED_NAME}"
    mkdir -p "$TRANTOKEN"

    echo "MSPM encoding for dataset..."
    for SPLIT in "${PARTS[@]}"; do
        MERGED_PREFIX="$TOKEN/$SPLIT.${MERGED_NAME}.spm"
        echo "$MERGED_PREFIX.doc"
        # The merged files are written with '>>' below, so always clear any
        # leftovers from a previous run (not only when the .doc file exists).
        rm -f -- "$MERGED_PREFIX".*

        for field in q a e; do
            LG="${SUMM_LANG}"
            echo "  encoding $DATADIR/$SPLIT.$field.$LG.lc to $TOKEN/$SPLIT.$LG.spm.$field ..."
            python3 fairseq/scripts/spm_encode.py --model="$MODEL" \
                < "$DATADIR/$SPLIT.$field.$LG.lc" \
                > "$TOKEN/$SPLIT.$LG.spm.$field"

            LG="${TRAN_LANG}"
            echo "  encoding $TRANDIR/$SPLIT.$field.en2.$LG.lc.noempty to $TRANTOKEN/$SPLIT.$LG.spm.$field ..."
            python3 fairseq/scripts/spm_encode.py --model="$MODEL" \
                < "$TRANDIR/$SPLIT.$field.en2.$LG.lc.noempty" \
                > "$TRANTOKEN/$SPLIT.$LG.spm.$field"
        done

        # Source-language part: document = "e" and "a" fields joined by a
        # space, summary = "q" field.
        LG="${SUMM_LANG}"
        TAG="${LANG_TAGS[$LG]}"
        paste -d " " "$TOKEN/$SPLIT.$LG.spm.e" "$TOKEN/$SPLIT.$LG.spm.a" \
            | add_lang_markup "$TAG" >> "$MERGED_PREFIX.doc"
        add_lang_markup "$TAG" < "$TOKEN/$SPLIT.$LG.spm.q" >> "$MERGED_PREFIX.sum"

        # Translated-language part, appended after the source-language lines.
        LG="${TRAN_LANG}"
        TAG="${LANG_TAGS[$LG]}"
        paste -d " " "$TRANTOKEN/$SPLIT.$LG.spm.e" "$TRANTOKEN/$SPLIT.$LG.spm.a" \
            | add_lang_markup "$TAG" >> "$MERGED_PREFIX.doc"
        add_lang_markup "$TAG" < "$TRANTOKEN/$SPLIT.$LG.spm.q" >> "$MERGED_PREFIX.sum"
    done
fi

# Binarize the merged .doc/.sum files with fairseq's preprocess script.
echo "Generating data-bin for dataset..."
INPUT="$TOKEN"
BINDIR="$DATADIR/data-bin/$MODE/${MERGED_NAME}"
# File-name suffix of the tokenized files: "spm" for the sentencepiece modes,
# "bpe" for every other mode.
case "$MODE" in
    SPM | MSPM) TOKENTYPE="spm" ;;
    *) TOKENTYPE="bpe" ;;
esac

echo "Binarized $INPUT ($TOKENTYPE) to $BINDIR with dict $DICT"

# Arguments shared by the train and test invocations.
PREPROCESS_ARGS=(
    --source-lang doc
    --target-lang sum
    --destdir "$BINDIR"
    --srcdict "$DICT"
    --tgtdict "$DICT"
    --workers 30
)

# PARTS always holds exactly one split, so its length cannot distinguish a
# train run from a test run; branch on the selected split itself so that a
# test run binarizes the test files instead of the train files.
if [[ "${PARTS[0]}" == "test" ]]; then
    PREPROCESS_ARGS+=(
        --testpref "$INPUT/test.${MERGED_NAME}.$TOKENTYPE"
        --thresholdtgt 0
        --thresholdsrc 0
    )
else
    PREPROCESS_ARGS+=(--trainpref "$INPUT/train.${MERGED_NAME}.$TOKENTYPE")
fi

python3 fairseq/preprocess.py "${PREPROCESS_ARGS[@]}"