
# --- Experiment configuration -------------------------------------------
# Tokenization mode. The binarization step near the end of this script maps
# "SPM" and "MSPM" to the "spm" file-name infix and anything else to "bpe".
MODE='MSPM'
# Dictionary file passed to fairseq preprocessing as both --srcdict and
# --tgtdict.
DICT='/home/tiger/mbart.cc25/dict_extend.txt'
# Data root: per-language inputs are read from $DIR/zh and $DIR/en, and the
# generated files are written underneath it.
DIR='/opt/tiger/sumtest/tmpdata/trans_shanbo/MSPM'

#######################################
# Emit an endless, reproducible pseudo-random byte stream for a given seed.
# The stream is /dev/zero encrypted with AES-256-CTR using the seed as the
# passphrase and no salt, so the same seed yields the same bytes. It is meant
# to be used as a `--random-source` for tools such as `shuf`.
# The output never ends on its own: always attach it to a reader that stops
# consuming (a pipe or a <(...) process substitution).
# Arguments: $1 - seed, used verbatim as the openssl passphrase
# Outputs:   unbounded binary data on stdout
#######################################
get_seeded_random() {
  # `local` keeps the seed from leaking into, or clobbering, a global `seed`
  # when the function is called in the current shell.
  local seed="$1"
  # stderr is discarded on purpose — presumably to hide openssl's complaints
  # (e.g. the write error once the reader closes the stream).
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}

# Build the "parallel" document set. For each split, the zh documents are
# concatenated with the en documents (zh first), and the result is shuffled
# with a fixed seed so the order is reproducible across runs.
# NOTE(review): this section was originally labelled "monolingual" although
# the tag is "parallel" — confirm which one is intended.
# Requires bash: the shuffle uses <(...) process substitution.
tag="parallel"
OUTDIR="${DIR}/${tag}"
# `mkdir -p` is a no-op when the directory already exists, so no existence
# check is needed beforehand.
mkdir -p -- "${OUTDIR}"
for split in train dev test; do
  cat "${DIR}/zh/${split}.zh.spm.doc" "${DIR}/en/${split}.en.spm.doc" \
    > "${OUTDIR}/${split}.${tag}.spm.doc.noshuffle"
  # The summary side is currently disabled. Note that it lists en before zh,
  # the reverse of the documents, and reuses seed 66 — presumably so that each
  # document stays aligned with a summary in the other language; confirm
  # before re-enabling.
#   cat "${DIR}/en/${split}.en.spm.sum" "${DIR}/zh/${split}.zh.spm.sum" > "${OUTDIR}/${split}.${tag}.spm.sum.noshuffle"
  echo "shuffling"
  shuf --random-source=<(get_seeded_random 66) \
    "${OUTDIR}/${split}.${tag}.spm.doc.noshuffle" \
    > "${OUTDIR}/${split}.${tag}.spm.doc"
#   shuf --random-source=<(get_seeded_random 66) "${OUTDIR}/${split}.${tag}.spm.sum.noshuffle" > "${OUTDIR}/${split}.${tag}.spm.sum"
done

# Binarize the shuffled training data with fairseq's preprocess script.
# Only --trainpref is passed, so the dev/test files are not binarized here.
# NOTE(review): fairseq presumably looks for "<trainpref>.doc" and
# "<trainpref>.sum"; this script does not show the ".sum" file being
# produced — confirm it exists before running.
INPUT="${OUTDIR}"
BINDIR="${DIR}/data-bin/${MODE}/${tag}"

# File-name infix of the tokenized data: "spm" for the SPM and MSPM modes,
# "bpe" for every other mode.
case "${MODE}" in
  SPM | MSPM) TOKENTYPE="spm" ;;
  *) TOKENTYPE="bpe" ;;
esac

# Logged before preprocessing starts, despite the past tense of the message.
printf '%s\n' "Binarized $INPUT ($TOKENTYPE) to $BINDIR with dict $DICT"

# The same dictionary is used for both the source and the target side.
python3 fairseq/preprocess.py \
  --source-lang doc --target-lang sum \
  --trainpref "${INPUT}/train.${tag}.${TOKENTYPE}" \
  --destdir "${BINDIR}" \
  --srcdict "${DICT}" --tgtdict "${DICT}" \
  --workers 70
