#!/usr/bin/env bash

#######################################
# Emit an endless, reproducible stream of pseudo-random bytes.
#
# The stream is the AES-256-CTR encryption of /dev/zero keyed by the given
# seed, so the same seed always yields the same bytes. The stream never ends
# on its own: the consumer (e.g. `shuf --random-source=<(...)`) must stop
# reading, at which point openssl is terminated by the closed pipe.
#
# Arguments: $1 - seed (used as the openssl passphrase)
# Outputs:   raw bytes on stdout; openssl diagnostics are discarded
#######################################
get_seeded_random() {
  # `local` keeps the seed from leaking into (or clobbering) a global `seed`.
  local seed=$1
  openssl enc -aes-256-ctr -pass "pass:${seed}" -nosalt \
    </dev/zero 2>/dev/null
}

# Code checkout; tokenize() looks for fairseq/scripts/spm_encode.py below it.
base_dir=/opt/tiger/sumtest/multilingual

# Absolute directory containing this script (symlinks resolved).
pwd=$(dirname "$(readlink -f "$0")")
# Raw CC100 text; only referenced by the disabled sampling stage below.
ccROOT=/home/tiger/cc100
# Working directory for the pseudo-summarisation (denoising) data.
denoiseDIR=/home/tiger/denoise
# Root of the WikiLingua summarisation data.
summDIR=/home/tiger/WikiLingua
# Name of the dataset version produced by this script.
DATAVER=mspm4_En_Summ_ZhJa_pseudo
DATADIR=/home/tiger/WikiLingua/$DATAVER

# -p creates missing parents (e.g. /home/tiger/WikiLingua) and is a no-op
# when the directory already exists.
mkdir -p "$denoiseDIR" "$DATADIR"

#######################################
# Encode a text file with mBART's sentencepiece model.
#
# On first use the mBART CC25 archive is copied from HDFS and unpacked under
# /home/tiger, and the extended dictionary is fetched if it is missing.
# NOTE(review): the archive is expected to unpack to /home/tiger/mbart.cc25;
# confirm against the tarball contents.
#
# Globals:   base_dir (read) - checkout containing fairseq/scripts/spm_encode.py
# Arguments: $1 - input text file
#            $2 - output file receiving the encoded text
# Outputs:   progress messages and the first encoded line on stdout
# Returns:   non-zero if a download, the extraction or the encoding fails
#######################################
tokenize() {
    local src_file=$1
    local dst_file=$2
    local mbart_dir=/home/tiger/mbart.cc25
    local spm_model=$mbart_dir/sentence.bpe.model
    local dict_file=$mbart_dir/dict_extend_extra2.txt
    local hdfs_root=hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained

    echo "tokenize ${src_file} to ${dst_file} using mbart's spm..."

    # setup MBART
    if [ ! -d "$mbart_dir" ]; then
        hadoop fs -copyToLocal "$hdfs_root/mbart.CC25.tar.gz" /home/tiger || return
        tar -xvzf /home/tiger/mbart.CC25.tar.gz -C /home/tiger || return
    fi
    if [ ! -e "$dict_file" ]; then
        hadoop fs -copyToLocal "$hdfs_root/dict_extend_extra2.txt" "$mbart_dir" || return
    fi

    # Stop here on failure so a broken encode is not reported as success by
    # the `head` below.
    python3 "${base_dir}/fairseq/scripts/spm_encode.py" --model="$spm_model" \
        < "$src_file" > "$dst_file" || return

    echo "first line of ${dst_file}"
    head -n 1 "$dst_file"
}

#######################################
# Copy an inclusive range of lines from one file to another.
#
# Arguments: $1 - input file
#            $2 - output file (overwritten)
#            $3 - first line to keep (1-based)
#            $4 - last line to keep
# Outputs:   two progress messages on stdout
#######################################
sample() {
    # Locals so the caller's own `input`/`output` variables are not clobbered.
    local src_file=$1
    local dst_file=$2
    local first=$3
    local last=$4

    echo "sample line $first to line $last from $src_file"
    echo "results will be saved to $dst_file"
    sed -n "${first},${last}p" "$src_file" > "$dst_file"
}

#######################################
# Wrap every line in sentence markers and prepend a prefix token.
#
# Per line: each "<q>" becomes " </s>", "<s> " is put in front, " </s>" is
# appended, and finally "<prefix> " is prepended.
#
# Arguments: $1 - input file
#            $2 - output file (overwritten)
#            $3 - prefix token, inserted literally (e.g. "[en_XX]")
#######################################
addPrefix() {
    local src_file=$1
    local dst_file=$2
    local prefix=$3
    local escaped

    # Escape the characters that are special in a sed replacement (\ / &) so
    # the prefix is always inserted verbatim instead of breaking the command.
    escaped=$(printf '%s' "$prefix" | sed -e 's|[\\/&]|\\&|g') || return

    sed -e 's/<q>/ <\/s>/g' \
        -e 's/^/<s> /' \
        -e 's/$/ <\/s>/' \
        -e "s/^/${escaped} /" \
        "$src_file" > "$dst_file"
}

# # # sample denoise data
# echo "Sampling pseudo summ data..."
# for lg in zh ja
# do
#     for split in train dev
#     do
#         if [ "$split" == "train" ]; then
#             head -n 9800000 $ccROOT/${lg}.first10m.txt > $denoiseDIR/${split}.${lg}.sent
#         else
#             head -n 10000000 $ccROOT/${lg}.first10m.txt | tail -n 200000 > $denoiseDIR/${split}.${lg}.sent
#         fi
#         python3 ../../mergeDoc.py -i $denoiseDIR/${split}.${lg}.sent -o $denoiseDIR/${split}.${lg} -d "<q>" -m mergeDoc
#         python3 ../wikiCreateData.py -i $denoiseDIR/${split}.${lg} -os $denoiseDIR/${split}.${lg}.src -ot $denoiseDIR/${split}.${lg}.tgt \
#             -m getPseudoSumm -t 40 -d '<q>'
#     done
# done


# # # tokenize summ data
# lg=en
# TOKEN="$DATADIR/MSPM/summ/$lg"
# if [ ! -e $TOKEN ]; then
#     mkdir -p $TOKEN
# fi
# for SPLIT in train dev test
# do
#     tokenize $summDIR/$lg/$SPLIT.$lg.doc $TOKEN/$SPLIT.$lg.spm.src
#     tokenize $summDIR/$lg/$SPLIT.$lg.sum $TOKEN/$SPLIT.$lg.spm.tgt
# done

# # # tokenize denoise data
# for lg in ja zh
# do
#     TOKEN="$DATADIR/MSPM/summ/${lg}"
#     if [ ! -e $TOKEN ]; then
#         mkdir -p $TOKEN
#     fi
#     for split in train dev
#     do
#         input=$denoiseDIR/$split.$lg.tgt
#         output=$TOKEN/$split.$lg.spm.tgt
#         tokenize $input $output

#         input=$denoiseDIR/$split.$lg.src
#         output=$TOKEN/$split.$lg.spm.src
#         tokenize $input $output
#     done
# done

# # # add prefix tokens
# for task in summ
# do
#     for lg_tag in en_XX zh_CN ja_XX
#     do
#         tags=(${lg_tag//_/ })
#         lg=${tags[0]}
#         INDIR="$DATADIR/MSPM/${task}/${lg}"
#         if [ ! -e $INDIR ]; then
#             echo "[Adding prefix token] the directory ${INDIR} does not exist"
#         else
#             for field in src tgt
#             do
#                 addPrefix ${INDIR}/train.${lg}.spm.${field} ${INDIR}/train.${lg}.spm.prefix.${field} "[${lg_tag}]"
#                 addPrefix ${INDIR}/dev.${lg}.spm.${field} ${INDIR}/dev.${lg}.spm.prefix.${field} "[${lg_tag}]"
#                 if [ "$task" == "summ" ]; then
#                     addPrefix ${INDIR}/test.${lg}.spm.${field} ${INDIR}/test.${lg}.spm.prefix.${field} "[${lg_tag}]"
#                 fi
#             done
#         fi
#     done
# done

# concat denoising data with summ data
DIR=$DATADIR/MSPM
lengthDIR=$DATADIR/MSPM/filtered

# head -n 100000 $DIR/summ/zh/train.zh.spm.prefix.src > $DIR/summ/zh/train.zh.spm.prefix.src.tmp
# head -n 100000 $DIR/summ/zh/train.zh.spm.prefix.tgt > $DIR/summ/zh/train.zh.spm.prefix.tgt.tmp

# head -n 100000 $DIR/summ/ja/train.ja.spm.prefix.src > $DIR/summ/ja/train.ja.spm.prefix.src.tmp
# head -n 100000 $DIR/summ/ja/train.ja.spm.prefix.tgt > $DIR/summ/ja/train.ja.spm.prefix.tgt.tmp

# cat $DIR/summ/en/train.en.spm.prefix.src $DIR/summ/zh/train.zh.spm.prefix.src.tmp $DIR/summ/ja/train.ja.spm.prefix.src.tmp > $DIR/trainJoint.noshuffle.spm.src
# cat $DIR/summ/en/train.en.spm.prefix.tgt $DIR/summ/zh/train.zh.spm.prefix.tgt.tmp $DIR/summ/ja/train.ja.spm.prefix.tgt.tmp > $DIR/trainJoint.noshuffle.spm.tgt

# Remove dev/test files left in $DIR by an earlier run. -f keeps a first run
# (nothing to delete yet) from printing errors for the unmatched patterns.
rm -f -- "$DIR"/dev*.src "$DIR"/dev*.tgt "$DIR"/test*.src "$DIR"/test*.tgt

# Stage the prefixed dev sets of every language under a flat name in $DIR.
for lg in en zh ja
do
    for side in src tgt
    do
        cp "$DIR/summ/${lg}/dev.${lg}.spm.prefix.${side}" "$DIR/dev${lg}Summ.noshuffle.spm.${side}"
    done
done

# Only English has a test set staged here.
cp "$DIR/summ/en/test.en.spm.prefix.src" "$DIR/testenSumm.noshuffle.spm.src"
cp "$DIR/summ/en/test.en.spm.prefix.tgt" "$DIR/testenSumm.noshuffle.spm.tgt"

# Length-filter every split, then shuffle it.
# NOTE(review): filterByMaxSize.py presumably writes the filtered pair under
# the same file names into $lengthDIR (that is where shuf reads them from);
# 1010 is the limit passed to it -- confirm its exact meaning in that script.
for field in trainJoint devenSumm devzhSumm devjaSumm testenSumm
do
    # Abort instead of shuffling stale or missing files when filtering fails.
    if ! python3 ../../filterByMaxSize.py \
        "$DIR/${field}.noshuffle.spm.src" "$DIR/${field}.noshuffle.spm.tgt" \
        "$lengthDIR" 1010; then
        echo "filterByMaxSize.py failed for ${field}" >&2
        exit 1
    fi
    # Source and target are shuffled with the same seed so both files get the
    # same permutation and stay line-aligned (given equal line counts).
    shuf --random-source=<(get_seeded_random 66) "$lengthDIR/${field}.noshuffle.spm.src" > "$DIR/${field}.spm.doc"
    shuf --random-source=<(get_seeded_random 66) "$lengthDIR/${field}.noshuffle.spm.tgt" > "$DIR/${field}.spm.sum"
done

# Hand the shuffled splits to binary.sh, which writes its output to $BINDIR.
# binary.sh is resolved relative to the current working directory.
echo "Binary data ..."
BINDIR=$DATADIR/data-bin
bash binary.sh \
    --trainpref "$DIR/trainJoint.spm" \
    --validpref "$DIR/devenSumm.spm,$DIR/devzhSumm.spm,$DIR/devjaSumm.spm" \
    --testpref "$DIR/testenSumm.spm" \
    --destdir "$BINDIR"
