#!/usr/bin/env bash
#
# Data-preparation driver. In order, it:
#   1. renames each ${INDIR}/<lg>.sample.txt to <lg>.sample and runs
#      addNoise.py (-m noiseV1) on it to produce <lg>.noise, for es/de/zh;
#   2. runs tokenize.sh on INDIR for es, zh and de;
#   3. runs tokenizeSumm.sh on summDIR for en and fr;
#   4. runs concat.sh on the two MSPM directories with DATAVER;
#   5. runs binary.sh on summDIR with DATAVER.
#
# The helper scripts are expected to live next to this script.
# NOTE(review): what each helper writes is not visible from here — confirm
# against the helpers before changing paths or argument order.

# Abort on the first failing step so later stages never run on missing or
# partially produced data.
set -euo pipefail

# Print a message to stderr and exit with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Directory containing this script (symlinks resolved); helpers are invoked
# through it so the pipeline does not depend on the caller's working directory.
script_dir=$(dirname -- "$(readlink -f -- "$0")")
readonly script_dir

readonly INDIR=/home/tiger/cc100/samples
readonly summDIR=/home/tiger/summ
readonly DATAVER=EnFr_Summ_DeEsZh_Unsup

for lg in es de zh; do
  sample="${INDIR}/${lg}.sample"
  # Rename only when the .txt input is still present so a re-run after a
  # partial failure does not die on the already-renamed file.
  if [[ -e "${sample}.txt" ]]; then
    mv -- "${sample}.txt" "${sample}"
  elif [[ ! -e "${sample}" ]]; then
    die "missing input sample: ${sample}.txt"
  fi
  python3 "${script_dir}/addNoise.py" -m noiseV1 \
    -i "${sample}" -o "${INDIR}/${lg}.noise" -l "${lg}"
done

echo "Tokenizing unsupervised data ..."
bash "${script_dir}/tokenize.sh" "${INDIR}" es es_XX MSPM
bash "${script_dir}/tokenize.sh" "${INDIR}" zh zh_CN MSPM
bash "${script_dir}/tokenize.sh" "${INDIR}" de de_DE MSPM

echo "Tokenizing summarization data ..."
bash "${script_dir}/tokenizeSumm.sh" "${summDIR}" en en_XX MSPM
bash "${script_dir}/tokenizeSumm.sh" "${summDIR}" fr fr_XX MSPM

bash "${script_dir}/concat.sh" "${summDIR}/MSPM" "${INDIR}/MSPM" "${DATAVER}"

echo "Binary data ..."
bash "${script_dir}/binary.sh" "${summDIR}" "${DATAVER}" MSPM
