#!/usr/bin/env bash

#######################################
# Emit an endless, deterministic byte stream derived from a seed.
# The stream is /dev/zero encrypted with AES-256-CTR using the seed as the
# passphrase (no salt), so the same seed always yields the same bytes.
# The stream never ends on its own; the consumer must stop reading
# (e.g. `shuf --random-source=<(get_seeded_random 66) ...`).
# Arguments: $1 - seed string used as the openssl passphrase
# Outputs:   raw bytes on stdout
#######################################
get_seeded_random() {
  # Keep the seed local so calling this function in the current shell does
  # not overwrite a global variable named "seed".
  local seed="$1"
  # openssl's stderr diagnostics are intentionally discarded.
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}

# Absolute directory containing this script (symlinks resolved by readlink -f).
# NOTE(review): this variable is not referenced again in the visible script;
# the relative paths used later are resolved against the caller's working
# directory, not against this value.
pwd=$(dirname "$(readlink -f "$0")")
ccPrefix=/opt/tiger/sumtest/cc100
# NOTE(review): summDIR is not used in the visible script.
summDIR=/home/tiger/summ
# Dataset version label; passed to binary.sh at the end of the script.
DATAVER=ZhEn_Unsup_infillNoise

# Scratch directory for the intermediate addNoise.py outputs; removed at the
# end of the script. -p keeps a leftover directory from an aborted run from
# causing an error.
# NOTE(review): TMPDIR is also the conventional temp-dir environment variable;
# if it is already exported, child processes will see this value.
TMPDIR=~/tmp_afterTokenize
mkdir -p "$TMPDIR"

# Input files are read from INDIR; final files are written to INDIR/MSPM.
INDIR=${ccPrefix}
OUTDIR=$INDIR/MSPM
# The output directory persists between runs, so tolerate it already existing.
mkdir -p "$OUTDIR"

# Build the noised source/target files for each split. Only "dev" is processed
# here; use `for split in dev train` to also process the train split.
for split in dev
do
    echo "[${split}] Tokenizing unsupervised data ..."

    for lg in en zh
    do
        # Language tag prepended to every output line.
        case "$lg" in
            en) LG_TAG="en_XX" ;;
            zh) LG_TAG="zh_CN" ;;
        esac

        # addNoise.py reads the -i file and writes the -os / -ot files
        # (presumably the noised source and its target -- confirm against
        # addNoise.py). The script path is relative to the working directory.
        python3 ../addNoise.py -m noiseV2 \
            -i "${INDIR}/${lg}.${split}.spm" \
            -os "${TMPDIR}/${lg}.${split}.spm.src" \
            -ot "${TMPDIR}/${lg}.${split}.spm.tgt" \
            -l "${lg}"

        # Rewrite each line as "[TAG] <s> ... </s>", turning every <q>
        # marker into " </s>". The tag is added last so it ends up before <s>.
        for side in src tgt
        do
            sed -e "s/<q>/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${LG_TAG}] /" \
                "${TMPDIR}/${lg}.${split}.spm.${side}" \
                > "${OUTDIR}/${lg}.${split}.spm.${side}"
        done
    done

    # Concatenate English then Chinese, in the same order for both files.
    cat "${OUTDIR}/en.${split}.spm.src" "${OUTDIR}/zh.${split}.spm.src" \
        > "${OUTDIR}/${split}.noshuffle.spm.doc"
    cat "${OUTDIR}/en.${split}.spm.tgt" "${OUTDIR}/zh.${split}.spm.tgt" \
        > "${OUTDIR}/${split}.noshuffle.spm.sum"

    # Shuffle both files with the same seeded random source. This is meant to
    # keep doc/sum lines paired, which relies on both files having the same
    # number of lines.
    shuf --random-source=<(get_seeded_random 66) \
        "${OUTDIR}/${split}.noshuffle.spm.doc" > "${OUTDIR}/${split}.spm.doc"
    # Fixed: the .sum output was previously produced from the .doc input,
    # making it a copy of the document file.
    shuf --random-source=<(get_seeded_random 66) \
        "${OUTDIR}/${split}.noshuffle.spm.sum" > "${OUTDIR}/${split}.spm.sum"

done

echo "Binary data ..."
# binary.sh is resolved relative to the current working directory.
# It receives the input directory, the dataset version label, and the name of
# the output subdirectory.
bash binary.sh "$INDIR" "$DATAVER" MSPM

# Remove the scratch directory. The :? guard aborts instead of running rm
# with an empty path if TMPDIR is unset or empty.
rm -r -- "${TMPDIR:?}"