#!/usr/bin/env bash

#######################################
# Emit a reproducible pseudo-random byte stream derived from a seed.
#
# AES-256-CTR-encrypts /dev/zero with the seed as the passphrase and no
# salt, so the stream depends only on the seed. The input is endless, so
# the caller is responsible for bounding the output (e.g. `head -c N`, or
# by handing it to a consumer that stops reading on its own).
#
# Arguments: $1 - seed string, used as the openssl passphrase
# Outputs:   raw binary stream on stdout; openssl diagnostics are discarded
# Returns:   exit status of openssl
#######################################
get_seeded_random() {
  # Keep the seed function-local so a call never overwrites a caller's
  # (or a global) variable that happens to be named `seed`.
  local seed="$1"
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}

# ---------------------------------------------------------------------------
# Configuration and working directories.
# ---------------------------------------------------------------------------

# Absolute directory containing this script (symlinks resolved). Quoted so
# a script path containing whitespace is handled correctly.
pwd=$(dirname -- "$(readlink -f -- "$0")")
ccPrefix=/opt/tiger/sumtest/cc100
# Not referenced in the visible part of this script.
summDIR=/home/tiger/summ
DATAVER=ZhEnFr_Unsup_infillNoise_r0.35_maxlen256
langs=(en zh fr)

# Language code -> language tag (only used by the disabled sed steps in the
# processing loop).
declare -A langtoken_dict
langtoken_dict=([en]="en_XX" [fr]="fr_XX" [zh]="zh_CN")

# Working directory under $HOME, one per data version.
# NOTE(review): TMPDIR is also the conventional temp-dir environment
# variable; if it is exported in the calling environment, child processes
# will see this value too — confirm that is intended.
TMPDIR=~/tmp_afterTokenize_${DATAVER}
# -p: the directory normally already exists on reruns (the processing loop
# reads files that must already be in $OUTDIR), so that is not an error.
# Any other failure leaves nothing to work in, so stop here.
mkdir -p -- "$TMPDIR" || exit 1

INDIR=${ccPrefix}
OUTDIR=$TMPDIR/MSPM
mkdir -p -- "$OUTDIR" || exit 1

# For every split and language: copy the .spm.src/.spm.tgt files to
# .spm.doc/.spm.sum, run binary_devPerLang.sh, then move the resulting
# $TMPDIR/data-bin/MSPM/<lang> directory under ${ccPrefix}/data-bin/MSPM/.
# Any failing step aborts the script so that stale or partial output is
# never moved into place.

# Make sure the destination exists up front: `mv <dir> <missing-path>/`
# would otherwise rename the first language directory to "MSPM" itself.
mkdir -p -- "${ccPrefix}/data-bin/MSPM" || exit 1

for split in dev; do
    for lg in "${langs[@]}"; do
        LG_TAG=${langtoken_dict[$lg]}
        # The noise-generation steps below are disabled, so the cp commands
        # further down expect ${OUTDIR}/${lg}.${split}.spm.{src,tgt} to
        # exist already (e.g. from an earlier run with these steps enabled).
        # echo "[${split} $lg ${LG_TAG}] adding noise ..."
        # python3 ../addNoise.py -m removeLongInstance -i ${INDIR}/${lg}.${split}.spm -os ${TMPDIR}/${lg}.${split}.spm.short --max-length 256 -l ${lg} -t
        # python3 ../addNoise.py -m noiseV2 -i ${TMPDIR}/${lg}.${split}.spm.short -os ${TMPDIR}/${lg}.${split}.spm.src -ot ${TMPDIR}/${lg}.${split}.spm.tgt -l ${lg} -t -r 0.35
        # cat ${TMPDIR}/${lg}.${split}.spm.src | sed -e "s/<q>/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${LG_TAG}] /" \
        #     > ${OUTDIR}/${lg}.${split}.spm.src
        # cat ${TMPDIR}/${lg}.${split}.spm.tgt | sed -e "s/<q>/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${LG_TAG}] /" \
        #     > ${OUTDIR}/${lg}.${split}.spm.tgt

        # Presumably binary_devPerLang.sh looks for the .doc/.sum suffixes —
        # TODO confirm against that script.
        if ! cp -- "${OUTDIR}/${lg}.${split}.spm.src" "${OUTDIR}/${lg}.${split}.spm.doc"; then
            echo "error: cannot stage ${OUTDIR}/${lg}.${split}.spm.src" >&2
            exit 1
        fi
        if ! cp -- "${OUTDIR}/${lg}.${split}.spm.tgt" "${OUTDIR}/${lg}.${split}.spm.sum"; then
            echo "error: cannot stage ${OUTDIR}/${lg}.${split}.spm.tgt" >&2
            exit 1
        fi

        echo "Binary ..."
        # Relative path: resolved against the current working directory.
        if ! bash binary_devPerLang.sh "$TMPDIR" "$lg" MSPM; then
            echo "error: binary_devPerLang.sh failed for ${lg} (${split})" >&2
            exit 1
        fi

        echo "moving $TMPDIR/data-bin/MSPM/${lg} to ${ccPrefix}/data-bin/MSPM/"
        if ! mv -- "$TMPDIR/data-bin/MSPM/${lg}" "${ccPrefix}/data-bin/MSPM/"; then
            echo "error: cannot move $TMPDIR/data-bin/MSPM/${lg}" >&2
            exit 1
        fi
    done
done
# rm -r $TMPDIR