
# Working directory that receives the collected and derived data files.
# The tilde is left unquoted on purpose so the shell expands it.
tmpdir=~/concat_sup_unsup/
# Create the working directory if nothing exists at that path yet. The
# expansion is quoted so a home path containing whitespace or glob
# characters is still treated as a single word.
[ -e "$tmpdir" ] || mkdir -p -- "$tmpdir"

# Directory the source corpus files are copied from.
rootdir=/opt/tiger/sumtest/tmpdata
# Mode name; later used to pick the token type and the data-bin subdirectory.
MODE="MSPM"
# Dictionary file handed to the binarization step.
DICT="/home/tiger/mbart.cc25/dict_extend.txt"

#######################################
# Write an unbounded byte stream derived from a seed to stdout.
# The stream is the AES-256-CTR encryption of /dev/zero, keyed from the
# seed with no salt, so the same seed is expected to give the same bytes
# (NOTE(review): key derivation defaults may differ between openssl
# versions -- confirm if cross-machine reproducibility matters).
# Arguments: $1 - seed, used as the openssl passphrase
# Outputs:   endless byte stream on stdout; the reader decides when to stop
#######################################
get_seeded_random() {
  # local: do not overwrite a caller's variable that happens to be named "seed"
  local seed="$1"
  # stderr is discarded, which also hides any openssl diagnostics
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}

# Collect the input files for every split into the working directory.
# All expansions are quoted so paths with whitespace stay single words;
# only the "*" is left outside the quotes so it still globs.
for split in train dev test
do
    # Every file of this split matching <split>.*.en, under its own name.
    cp -- "$rootdir/${split}."*.en "$tmpdir/"
    # The "unsup" sets use one and the same text as both the x and the y side:
    # the .y.en file for English, and the .y.en.2zh file from trans_shanbo/
    # for Chinese.
    for side in x y
    do
        cp -- "$rootdir/${split}.y.en" "$tmpdir/${split}.${side}.en_unsup"
        cp -- "$rootdir/trans_shanbo/${split}.y.en.2zh" "$tmpdir/${split}.${side}.zh_unsup"
    done
done

# Run the preprocessing script once per corpus variant.
# NOTE(review): the arguments look like <data dir> <file suffix> <language tag>
# <mode> -- confirm against xgiga_predata.sh.
# The directory is quoted so it reaches the script as exactly one argument.
bash xgiga_predata.sh "$tmpdir" en en_XX MSPM
bash xgiga_predata.sh "$tmpdir" en_unsup en_XX MSPM
bash xgiga_predata.sh "$tmpdir" zh_unsup zh_CN MSPM

# Merge the three preprocessed corpora (en, en_unsup, zh_unsup) into a single
# corpus per split, then shuffle it.
tag="sup_en_unsup_enzh"
INDIR="$tmpdir/MSPM"
OUTDIR="$tmpdir/MSPM/$tag"
# Quoted throughout so directories containing whitespace are handled.
[ -e "${OUTDIR}" ] || mkdir -p -- "${OUTDIR}"
for split in train dev test
do
  # Concatenate in a fixed order (en, en_unsup, zh_unsup) for both sides, so
  # line i of the .doc file and line i of the .sum file come from the same
  # corpus position.
  for side in doc sum
  do
    cat "$INDIR/en/${split}.en.spm.${side}" \
        "$INDIR/en_unsup/${split}.en_unsup.spm.${side}" \
        "$INDIR/zh_unsup/${split}.zh_unsup.spm.${side}" \
      > "${OUTDIR}/${split}.${tag}.spm.${side}.noshuffle"
  done
  echo "shuffling"
  # Both sides are shuffled with the same seed (66). Presumably this is meant
  # to apply the same permutation to .doc and .sum so the pairs stay aligned;
  # that only holds if both files have the same number of lines -- verify.
  for side in doc sum
  do
    shuf --random-source=<(get_seeded_random 66) \
      "${OUTDIR}/${split}.${tag}.spm.${side}.noshuffle" \
      > "${OUTDIR}/${split}.${tag}.spm.${side}"
  done
done

# Binarize the merged corpus with fairseq's preprocess script.
INPUT="$OUTDIR"
BINDIR="$tmpdir/data-bin/$MODE/$tag"

# SPM and MSPM modes use the "spm" file-name infix; every other mode uses "bpe".
case "$MODE" in
  SPM | MSPM) TOKENTYPE="spm" ;;
  *) TOKENTYPE="bpe" ;;
esac

echo "Binarized $INPUT ($TOKENTYPE) to $BINDIR with dict $DICT"

# The same dictionary serves as source and target dictionary; the train/dev/test
# prefixes point at the <split>.<tag>.<tokentype> files inside $INPUT.
preprocess_args=(
  --source-lang doc
  --target-lang sum
  --trainpref "$INPUT/train.$tag.$TOKENTYPE"
  --validpref "$INPUT/dev.$tag.$TOKENTYPE"
  --testpref "$INPUT/test.$tag.$TOKENTYPE"
  --destdir "$BINDIR"
  --srcdict "$DICT"
  --tgtdict "$DICT"
  --workers 70
)
python3 fairseq/preprocess.py "${preprocess_args[@]}"
