#!/usr/bin/env bash

# Emit an endless, reproducible pseudo-random byte stream derived from a seed.
#
# /dev/zero is encrypted with AES-256-CTR using the seed as the passphrase and
# no salt, so a given seed always produces the same bytes. The stream never
# ends by itself; the reader is expected to stop consuming it (this script
# hands it to `shuf --random-source`).
#
# Arguments: $1 - seed, used verbatim as the openssl passphrase
# Outputs:   raw bytes on stdout; openssl's stderr is discarded
get_seeded_random()
{
  # Function-scoped so a direct call cannot create or overwrite a global `seed`.
  local seed="$1"
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}

# Directory that holds the fairseq checkout; tokenize runs
# ${base_dir}/fairseq/scripts/spm_encode.py.
base_dir=/opt/tiger/sumtest/multilingual

# Absolute directory of this script (symlinks resolved). Every expansion is
# quoted so a script path containing whitespace is not word-split.
# NOTE(review): nothing in this script reads $pwd, while the helper scripts are
# invoked via relative paths -- confirm the expected working directory.
pwd=$(dirname "$(readlink -f "$0")")
# Raw monolingual text is read from $ccROOT/<lang>.first10m.txt.
ccROOT=/home/tiger/cc100
# Working directory for the raw (untokenized) denoising samples.
denoiseDIR=/home/tiger/denoise
# ccDIR=/home/tiger/cc100/sample
# Summarization corpus, laid out as $summDIR/<lang>/<split>.<lang>.{doc,sum}.
summDIR=/home/tiger/summ/clean0702
# Name of this dataset version; also the name of the output directory.
DATAVER=En_Summ_EnZh_Unsup_Noise2_Doc_V2
DATADIR=/home/tiger/$DATAVER

# Make sure both working directories exist. -p makes this a no-op when they
# are already present and also creates missing parent directories.
mkdir -p -- "$denoiseDIR" "$DATADIR"

# Encode a text file with the SentencePiece model shipped with mBART (cc25).
#
# On first use the mBART archive is copied from HDFS and unpacked under
# /home/tiger, and the extended dictionary is copied next to it. The input is
# then piped through fairseq's spm_encode.py and the first encoded line is
# printed as a quick sanity check.
#
# Globals:   base_dir (read) - directory containing the fairseq checkout
# Arguments: $1 - path of the raw text file to encode
#            $2 - path the encoded text is written to (overwritten)
# Outputs:   progress messages and the first output line on stdout
# Returns:   exit status of the final `head`
tokenize(){
    # All working variables are local so repeated calls do not leak state into
    # the top-level script.
    local INPUT=$1
    local OUTPUT=$2
    local MBART=/home/tiger/mbart.cc25
    local MODEL=$MBART/sentence.bpe.model
    local DICT=$MBART/dict_extend_extra2.txt

    echo "tokenize ${INPUT} to ${OUTPUT} using mbart's spm..."

    # setup MBART: fetch and unpack the pretrained model only when it is absent
    if [ ! -d "$MBART" ]; then
        hadoop fs -copyToLocal hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/mbart.CC25.tar.gz /home/tiger
        tar -xvzf /home/tiger/mbart.CC25.tar.gz -C /home/tiger
    fi
    # The dictionary is only downloaded here; this function does not read it.
    if [ ! -e "$DICT" ]; then
        hadoop fs -copyToLocal hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/dict_extend_extra2.txt "$MBART"
    fi
    # Quoted redirections: an unquoted path with whitespace would abort with
    # "ambiguous redirect" before the encoder even starts.
    python3 "${base_dir}/fairseq/scripts/spm_encode.py" --model="$MODEL" < "$INPUT" > "$OUTPUT"

    echo "first line of ${OUTPUT}"
    head -n 1 "$OUTPUT"
}

# Copy an inclusive range of lines from one file into another.
#
# Arguments: $1 - file to read
#            $2 - file to write (overwritten)
#            $3 - first line to keep (1-based)
#            $4 - last line to keep
# Outputs:   two progress messages on stdout
sample ()
{
    # Locals: `input` and `output` are also used as top-level names in this
    # script, so assigning them globally here would clobber the caller's values.
    local input=$1
    local output=$2
    local s=$3
    local e=$4

    echo "sample line $s to line $e from $input"
    echo "results will be saved to $output"
    # Paths are quoted so names containing whitespace are handled.
    sed -n "${s},${e}p" "$input" > "$output"
}

# Wrap every line of a file in sentence markers and prepend a literal prefix.
#
# Per line: each "<q>" becomes " </s>", "<s> " is added at the front,
# " </s>" at the end, and finally "<prefix> " is placed before everything.
#
# Arguments: $1 - file to read
#            $2 - file to write (overwritten)
#            $3 - prefix text, inserted literally
addPrefix ()
{
    local input=$1
    local output=$2
    local prefix=$3
    local escaped

    # The prefix ends up inside a sed replacement, where '/', '&' and '\' are
    # special. Escape them so any prefix is inserted exactly as given instead of
    # breaking the command ('/') or being reinterpreted ('&', '\').
    escaped=$(printf '%s' "$prefix" | sed -e 's|[\\/&]|\\&|g')

    sed -e "s/<q>/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/$escaped /" "$input" > "$output"
}

# Collect the raw (untokenized) denoising targets for both languages.
echo "Sampling denoising data..."

# Chinese: taken from the head of the monolingual file. The first 1,980,000
# lines form the training set; dev is the last 20,000 of the first 2,000,000
# lines. (For a quick smoke test, a `head -n 2` of the source file was used.)
lg=zh
for split in train dev; do
    cc_text=$ccROOT/${lg}.first10m.txt
    sent_tgt=$denoiseDIR/${split}.${lg}.sent.tgt
    merged_tgt=$denoiseDIR/${split}.${lg}.merged.tgt

    case "$split" in
        train)
            head -n 1980000 "$cc_text" > "$sent_tgt"
            ;;
        *)
            head -n 2000000 "$cc_text" | tail -n 20000 > "$sent_tgt"
            ;;
    esac

    # Two mergeDoc.py passes with "<q>" as delimiter: mergeSent, then mergeDoc.
    # NOTE(review): presumably sentences are first joined and then grouped into
    # documents -- confirm against mergeDoc.py.
    python3 ../mergeDoc.py -m mergeSent -i "$sent_tgt" -o "$merged_tgt" -d "<q>"
    python3 ../mergeDoc.py -m mergeDoc -i "$merged_tgt" -o "$denoiseDIR/${split}.${lg}.tgt" -d "<q>"
done

# English: the first 50,000 documents of each summarization split are used
# directly as denoising targets. (Smoke-test variant: `head -n 2` of the .sum
# file.)
lg=en
for split in train dev; do
    head -n 50000 "$summDIR/$lg/$split.$lg.doc" > "$denoiseDIR/${split}.${lg}.tgt"
done

echo "Tokenizing summarization data..."

# English summarization corpus: documents become the source side (.src) and
# summaries the target side (.tgt) under $DATADIR/MSPM/summ/en.
# (Smoke-test variant: tokenize only a `head -n 2` copy of each file.)
lg=en
TOKEN="$DATADIR/MSPM/summ/$lg"
[ -e "$TOKEN" ] || mkdir -p "$TOKEN"
for SPLIT in train dev test; do
    summ_stem=$summDIR/$lg/$SPLIT.$lg
    tokenize "$summ_stem.doc" "$TOKEN/$SPLIT.$lg.spm.src"
    tokenize "$summ_stem.sum" "$TOKEN/$SPLIT.$lg.spm.tgt"
done

# Denoising corpora: only the clean text exists at this point, so it is
# tokenized as the target side under $DATADIR/MSPM/denoise/<lang>.
for lg in en zh; do
    TOKEN="$DATADIR/MSPM/denoise/${lg}"
    [ -e "$TOKEN" ] || mkdir -p "$TOKEN"
    for split in train dev; do
        tokenize "$denoiseDIR/$split.$lg.tgt" "$TOKEN/$split.$lg.spm.tgt"
    done
done

# Derive the noisy source side of the denoising data from the tokenized targets.
for split in train dev; do
    for lgtag in zh_CN en_XX; do
        # Language code is the part of the tag before the underscore (zh_CN -> zh).
        lg=${lgtag%%_*}

        INDIR="$DATADIR/MSPM/denoise/${lg}"
        echo "[${split}] add noise to unsupervised ${lg} ..."
        # addNoise.py in noiseV2 mode reads the .tgt file and writes the .src file.
        python3 ../addNoise.py -m noiseV2 \
            -i "${INDIR}/$split.${lg}.spm.tgt" \
            -o "${INDIR}/$split.${lg}.spm.src" \
            -l "${lg}"
    done
done

# Prepend "[<lang tag>] [<task>]" to every tokenized line of every
# task/language pair that has a data directory.
for task in denoise summ; do
    # train and dev are always processed; only summ additionally gets test.
    parts=(train dev)
    if [ "$task" == "summ" ]; then
        parts+=(test)
    fi

    for lg_tag in zh_CN en_XX; do
        # Language code is the part of the tag before the underscore.
        lg=${lg_tag%%_*}
        INDIR="$DATADIR/MSPM/${task}/${lg}"

        # Skip combinations for which no data directory exists.
        if [ ! -e "$INDIR" ]; then
            echo "[Adding prefix token] the directory ${INDIR} does not exist"
            continue
        fi

        for field in src tgt; do
            for part in "${parts[@]}"; do
                addPrefix "${INDIR}/${part}.${lg}.spm.${field}" \
                    "${INDIR}/${part}.${lg}.spm.prefix.${field}" \
                    "[${lg_tag}] [${task}]"
            done
        done
    done
done


# Assemble the joint training set and the per-task dev/test sets, then filter
# and shuffle them.
DIR=$DATADIR/MSPM
lengthDIR=$DATADIR/MSPM/filtered

# Training data: English denoising + English summarization + Chinese
# denoising, concatenated in that order for both sides.
for side in src tgt; do
    cat "$DIR/denoise/en/train.en.spm.prefix.$side" \
        "$DIR/summ/en/train.en.spm.prefix.$side" \
        "$DIR/denoise/zh/train.zh.spm.prefix.$side" \
        > "$DIR/trainJoint.noshuffle.spm.$side"
done

# Drop dev/test files left over from an earlier run. -f keeps a first run
# quiet: without it every pattern that matches nothing is passed to rm
# literally and reported as a missing file.
rm -f -- "$DIR"/dev*.src "$DIR"/dev*.tgt "$DIR"/test*.src "$DIR"/test*.tgt

# Dev sets are kept separate per task/language; only summarization has a test set.
for side in src tgt; do
    cp "$DIR/summ/en/dev.en.spm.prefix.$side" "$DIR/devEnSumm.noshuffle.spm.$side"
    cp "$DIR/denoise/en/dev.en.spm.prefix.$side" "$DIR/devEnDenoise.noshuffle.spm.$side"
    cp "$DIR/denoise/zh/dev.zh.spm.prefix.$side" "$DIR/devZhDenoise.noshuffle.spm.$side"
    cp "$DIR/summ/en/test.en.spm.prefix.$side" "$DIR/testEnSumm.noshuffle.spm.$side"
done

for field in trainJoint devEnSumm devEnDenoise devZhDenoise testEnSumm; do
    # NOTE(review): filterByMaxSize.py is expected to write same-named filtered
    # files into $lengthDIR (shuf reads them from there); 1010 is presumably the
    # maximum length -- confirm against the script.
    python3 ../filterByMaxSize.py "$DIR/${field}.noshuffle.spm.src" "$DIR/${field}.noshuffle.spm.tgt" "$lengthDIR" 1010
    # Both sides are shuffled with the same seeded random source so that, as
    # long as they have the same number of lines, source and target stay paired.
    shuf --random-source=<(get_seeded_random 66) "$lengthDIR/${field}.noshuffle.spm.src" > "$DIR/${field}.spm.doc"
    shuf --random-source=<(get_seeded_random 66) "$lengthDIR/${field}.noshuffle.spm.tgt" > "$DIR/${field}.spm.sum"
done

# Hand the shuffled files to binary.sh, which writes its output to $BINDIR.
echo "Binary data ..."
BINDIR=$DATADIR/data-bin

# The three dev sets are passed as one comma-separated --validpref value.
valid_prefixes="$DIR/devEnSumm.spm,$DIR/devEnDenoise.spm,$DIR/devZhDenoise.spm"
binarize_args=(
    --trainpref "$DIR/trainJoint.spm"
    --validpref "$valid_prefixes"
    --testpref "$DIR/testEnSumm.spm"
    --destdir "$BINDIR"
)
bash binary.sh "${binarize_args[@]}"
