#!/usr/bin/env bash

#######################################
# Emit an endless pseudo-random byte stream derived from a seed.
# The stream is /dev/zero encrypted with AES-256-CTR using the seed as the
# passphrase (no salt), so the same seed reproduces the same bytes on a given
# openssl build. Used below as `shuf --random-source=<(get_seeded_random N)`.
# Arguments: $1 - seed (used verbatim as the openssl passphrase)
# Outputs:   raw bytes on stdout; never terminates on its own, the reader
#            must stop consuming. openssl diagnostics are discarded.
#######################################
get_seeded_random()
{
  # local: keep the seed out of the caller's global namespace.
  local seed="$1"
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}

# Project checkout; tokenize() runs fairseq/scripts/spm_encode.py from here.
base_dir=/opt/tiger/sumtest/multilingual

# Absolute directory of this script with symlinks resolved. Quoted so that a
# script path containing spaces does not get word-split.
pwd=$(dirname "$(readlink -f "$0")")
ccROOT=/home/tiger/cc100
# Receives copies of the raw train/dev documents.
denoiseDIR=/home/tiger/denoise
# ccDIR=/home/tiger/cc100/sample
# Input summarization data, laid out as <lang>/<split>.<lang>.{doc,sum}.
summDIR=/home/tiger/summ/clean0702
# Dataset version tag; also the name of the output directory.
DATAVER=En_Summ_EnZh_Unsup_Noise2_Doc_V2
DATADIR=/home/tiger/$DATAVER

# Create the working directories on first run (-p also creates missing parents).
[[ -e "$denoiseDIR" ]] || mkdir -p -- "$denoiseDIR"
[[ -e "$DATADIR" ]] || mkdir -p -- "$DATADIR"

#######################################
# Encode a text file with the mBART sentencepiece model.
# On first use, downloads and unpacks the mBART checkpoint archive from HDFS
# into /home/tiger, and fetches the extended dictionary into the checkpoint
# directory if it is not there yet.
# Globals:   base_dir (read) - checkout containing fairseq/scripts/spm_encode.py
# Arguments: $1 - input text file
#            $2 - output file for the encoded text
# Outputs:   progress messages and the first encoded line on stdout
# Returns:   1 if the sentencepiece model is unavailable; otherwise the
#            status of the final `head`
#######################################
tokenize() {
    local input=$1
    local output=$2
    local hdfs_root=hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained
    local mbart=/home/tiger/mbart.cc25
    local model=$mbart/sentence.bpe.model
    local dict=$mbart/dict_extend_extra2.txt

    echo "tokenize ${input} to ${output} using mbart's spm..."

    # setup MBART
    if [[ ! -d "$mbart" ]]; then
        hadoop fs -copyToLocal "$hdfs_root/mbart.CC25.tar.gz" /home/tiger
        tar -xvzf /home/tiger/mbart.CC25.tar.gz -C /home/tiger
    fi
    # The dictionary is not read here; it is only placed next to the model.
    if [[ ! -e "$dict" ]]; then
        hadoop fs -copyToLocal "$hdfs_root/dict_extend_extra2.txt" "$mbart"
    fi

    # Without the model the encoder can only fail, and the redirection below
    # would still truncate the output file. Stop with a clear message instead.
    if [[ ! -e "$model" ]]; then
        echo "tokenize: sentencepiece model not found: ${model}" >&2
        return 1
    fi

    python3 "${base_dir}/fairseq/scripts/spm_encode.py" --model="$model" < "$input" > "$output"

    echo "first line of ${output}"
    head -n 1 -- "$output"
}

#######################################
# Copy an inclusive range of lines from one file into another.
# Arguments: $1 - input file
#            $2 - output file (overwritten)
#            $3 - first line to copy (1-based, sed address)
#            $4 - last line to copy (sed address)
# Outputs:   two progress lines on stdout
#######################################
sample ()
{
    local input=$1
    local output=$2
    local s=$3
    local e=$4

    echo "sample line $s to line $e from $input"
    echo "results will be saved to $output"
    # -n with an explicit "p" prints only the addressed range.
    sed -n "${s},${e}p" -- "$input" > "$output"
}

#######################################
# Wrap every line of a file in sentence markers and prepend a prefix.
# Per line, in order: each "<q>" becomes " </s>", "<s> " is put at the start,
# " </s>" at the end, and finally "<prefix> " in front of everything.
# Arguments: $1 - input file
#            $2 - output file (overwritten)
#            $3 - prefix text, inserted literally
#######################################
addPrefix ()
{
    local input=$1
    local output=$2
    local prefix=$3

    # Escape the characters that are special in a sed replacement (\, / and &)
    # so the prefix is always inserted literally.
    local escaped
    escaped=$(printf '%s' "$prefix" | sed -e 's|[\\/&]|\\&|g')

    sed -e 's/<q>/ <\/s>/g' \
        -e 's/^/<s> /' \
        -e 's/$/ <\/s>/' \
        -e "s/^/${escaped} /" \
        -- "$input" > "$output"
}

# Copy the raw documents of the train and dev splits into $denoiseDIR as
# <split>.<lang>.tgt.
lg=zh
for split in train dev; do
    cat "$summDIR/$lg/$split.$lg.doc" > "$denoiseDIR/${split}.${lg}.tgt"
done

echo "Tokenizing summarization data..."

# Tokenize the summarization data: for each split the documents (.doc) are
# encoded to <split>.<lang>.spm.src and the summaries (.sum) to
# <split>.<lang>.spm.tgt under $DATADIR/MSPM/summ/<lang>.
lg=zh
TOKEN="$DATADIR/MSPM/summ/$lg"
[[ -e "$TOKEN" ]] || mkdir -p -- "$TOKEN"
for SPLIT in train dev test; do
    tokenize "$summDIR/$lg/$SPLIT.$lg.doc" "$TOKEN/$SPLIT.$lg.spm.src"
    tokenize "$summDIR/$lg/$SPLIT.$lg.sum" "$TOKEN/$SPLIT.$lg.spm.tgt"
done

# Add prefix tokens: every tokenized file <split>.<lang>.spm.<field> gets a
# sibling <split>.<lang>.spm.prefix.<field> whose lines start with
# "[<lg_tag>] [<task>]".
for task in summ; do
    for lg_tag in zh_CN; do
        # Language code is the part of the tag before the first underscore
        # (zh_CN -> zh).
        lg=${lg_tag%%_*}
        INDIR="$DATADIR/MSPM/${task}/${lg}"
        if [[ ! -e "$INDIR" ]]; then
            echo "[Adding prefix token] the directory ${INDIR} does not exist"
            continue
        fi
        for field in src tgt; do
            for prefix_split in train dev; do
                addPrefix "${INDIR}/${prefix_split}.${lg}.spm.${field}" \
                    "${INDIR}/${prefix_split}.${lg}.spm.prefix.${field}" \
                    "[${lg_tag}] [${task}]"
            done
            # Only the summarization task has a test split to prefix.
            if [[ "$task" == "summ" ]]; then
                addPrefix "${INDIR}/test.${lg}.spm.${field}" \
                    "${INDIR}/test.${lg}.spm.prefix.${field}" \
                    "[${lg_tag}] [${task}]"
            fi
        done
    done
done

DIR=$DATADIR/MSPM
lengthDIR=$DATADIR/MSPM/filtered

# Collect the prefixed Chinese test set under a flat name in $DIR.
cp -- "$DIR/summ/zh/test.zh.spm.prefix.src" "$DIR/testZhSumm.noshuffle.spm.src"
cp -- "$DIR/summ/zh/test.zh.spm.prefix.tgt" "$DIR/testZhSumm.noshuffle.spm.tgt"

for field in testZhSumm; do
    # NOTE(review): the helper path is relative to the current working
    # directory, not to this script's location - confirm how the script is run.
    # NOTE(review): the helper is handed $lengthDIR, but the shuf calls below
    # read the unfiltered .noshuffle files in $DIR - confirm this is intended.
    python3 ../filterByMaxSize.py "$DIR/${field}.noshuffle.spm.src" "$DIR/${field}.noshuffle.spm.tgt" "$lengthDIR" 1010
    # Both files are shuffled with the same seeded random source, so they get
    # the same permutation (assuming equal line counts) and the document /
    # summary pairs stay aligned.
    shuf --random-source=<(get_seeded_random 66) "$DIR/${field}.noshuffle.spm.src" > "$DIR/${field}.spm.doc"
    shuf --random-source=<(get_seeded_random 66) "$DIR/${field}.noshuffle.spm.tgt" > "$DIR/${field}.spm.sum"
done

echo "Binary data ..."
BINDIR=$DATADIR/data-bin
# Hand the shuffled test sets to binary.sh, which is looked up in the current
# working directory.
# NOTE(review): nothing in this script writes $DIR/testEnSumm.spm.* - confirm
# it is produced by another step before this runs.
bash binary.sh \
    --testpref "$DIR/testEnSumm.spm,$DIR/testZhSumm.spm" \
    --destdir "$BINDIR"

# bash binary.sh \
#     --testpref "$DIR/testZhSumm.spm" \
#     --destdir $BINDIR
