#!/usr/bin/env bash
#
# Builds the English "${DATAVER}" dataset:
#   1. For each split (train, dev), copies ${summDIR}/en/<split>.en.doc into a
#      scratch directory as <split>.en.sum.
#   2. Runs addNoise.py (mode "noiseBart") on that .sum file to produce the
#      matching <split>.en.doc file.
#   3. Copies both files into ${OUTDIR}/en.
#   4. Runs tokenize_and_filter.sh and then binary.sh on ${OUTDIR}.
#
# The scratch directory is removed when the script exits, whether it
# succeeded or not.

# Abort on the first failing command / unset variable so that later stages
# never run on missing or partially written inputs.
set -euo pipefail

SCRIPTDIR="/opt/tiger/sumtest/multilingual/utils"

# Directory containing this script (symlinks resolved).
here=$(dirname -- "$(readlink -f -- "$0")")

summDIR=/home/tiger/summ/clean0702
DATAVER=En_Unsup_bartNoise_Doc
# NOTE(review): TMPDIR is also the conventional temp-dir environment variable;
# if it is already exported by the caller, child processes will see this value.
# The name is kept as-is to avoid changing what the helper scripts observe.
TMPDIR=~/tmp_afterTokenize
OUTDIR=~/${DATAVER}

# Language code and language tag; constant for the whole run.
lg=en
LG_TAG="en_XX"

# Prints the path to use for helper script $1: the plain relative name when it
# exists in the current working directory (the historical lookup), otherwise
# the copy located next to this script.
helper_path() {
    local name=$1
    if [[ -f "${name}" ]]; then
        printf '%s\n' "${name}"
    else
        printf '%s\n' "${here}/${name}"
    fi
}

# -p: do not fail when the directories are left over from an earlier run.
mkdir -p -- "${TMPDIR}" "${OUTDIR}/${lg}"
# Remove the scratch directory on every exit path; ':?' refuses to run rm
# with an empty path.
trap 'rm -rf -- "${TMPDIR:?}"' EXIT

INDIR="${summDIR}/${lg}"

for split in train dev; do
    # The original document becomes the ".sum" side of the pair ...
    cp -- "${INDIR}/${split}.${lg}.doc" "${TMPDIR}/${split}.${lg}.sum"

    # ... and addNoise.py writes the ".doc" side from it.
    python3 "${SCRIPTDIR}/addNoise.py" -m noiseBart \
        -i "${TMPDIR}/${split}.${lg}.sum" \
        -o "${TMPDIR}/${split}.${lg}.doc" \
        -l "${lg}" -d "<q>" -t

    cp -- "${TMPDIR}/${split}.${lg}.sum" "${OUTDIR}/${lg}/${split}.${lg}.sum"
    cp -- "${TMPDIR}/${split}.${lg}.doc" "${OUTDIR}/${lg}/${split}.${lg}.doc"
done

echo "Tokenizing data ..."
bash "$(helper_path tokenize_and_filter.sh)" "${OUTDIR}" "${lg}" "${LG_TAG}" MSPM

echo "Binary data ..."
bash "$(helper_path binary.sh)" "${OUTDIR}" "${lg}" MSPM
