#!/usr/bin/env bash
#
# Data preparation pipeline driven by three positional arguments:
#   $1  data folder    (read by the python/bash helper scripts below)
#   $2  output folder  (receives the filtered and sentencepiece-encoded files)
#   $3  LG             (the language tag embedded in every file name)
#
# The script is expected to be started from the directory that contains the
# trans/, utils/ and fairseq/ folders, because every helper is invoked through
# a relative path.

# Stop at the first failing command, unset variable, or failing pipeline
# stage: every step consumes files written by the previous one, so carrying
# on after a failure would only process missing or partial data.
set -euo pipefail

if [[ $# -ne 3 ]]; then
  # Usage errors are diagnostics, so they go to stderr.
  echo "Run as following:" >&2
  echo "bash preShuffle.sh <data_folder> <output_folder> <LG>" >&2
  exit 1
fi

# Directory that holds sentence.bpe.model and dict.txt. Defaults to the
# original hard-coded location; export MBART before running the script to
# point it somewhere else.
MBART=${MBART:-/home/tiger/trans/model/mbart.cc25}
# Sentencepiece model handed to spm_encode.py via --model.
MODEL=$MBART/sentence.bpe.model
# Dictionary handed to preprocess.py as both --srcdict and --tgtdict.
DICT=$MBART/dict.txt

# Positional arguments, in the order given by the usage message:
# <data_folder> <output_folder> <LG>.
DATADIR="$1"
OUTDIR="$2"
LG="$3"

# Step 1: run trans/getLead.py on the train and dev documents.
# Expansions are quoted so that a data folder containing spaces or glob
# characters is passed as a single argument.
# NOTE(review): later steps read <split>.<LG>.lead2 and <split>.<LG>.doc10
# from the data folder; presumably getLead.py writes them — confirm.
echo "Get lead from " "${DATADIR}"
for split in train dev; do
  python trans/getLead.py -i "${DATADIR}/${split}.${LG}.doc"
done

# Step 2: run trans/translead.sh on the dev and train lead files, from inside
# trans/.
echo "Translate lead file..."
# `return` is only valid inside a function or a sourced file; in an executed
# script it fails and the script would carry on in the wrong directory.
# Abort explicitly instead.
cd trans || { echo "Cannot change directory to trans" >&2; exit 1; }
# NOTE(review): the working directory is now trans/, so a relative data
# folder resolves differently here than in the other steps — pass an
# absolute path for <data_folder>.
bash translead.sh "${DATADIR}/dev.${LG}.lead2" "${LG}"
bash translead.sh "${DATADIR}/train.${LG}.lead2" "${LG}"

# Step 3: go back up one directory and run trans/MergeShuffle.py on the
# .doc10 file (-i) together with the .lead2.multi file (-s) of each split.
echo "Merge doc and lead file..."
# `return` is only valid inside a function or a sourced file; in an executed
# script it fails and the relative trans/ path below would no longer resolve.
# Abort explicitly instead.
cd .. || { echo "Cannot change directory to parent of trans" >&2; exit 1; }
for split in train dev; do
  python trans/MergeShuffle.py \
    -i "${DATADIR}/${split}.${LG}.doc10" \
    -s "${DATADIR}/${split}.${LG}.lead2.multi"
done

# Step 4: run utils/filterempty.py on each split, passing the shuffled
# document file, the lead file and the output folder. Arguments are quoted so
# paths with spaces or glob characters stay intact.
# NOTE(review): the encoding step reads the same file names from the output
# folder; presumably filterempty.py writes them there — confirm.
echo "Filter empty file..."
for split in train dev; do
  python utils/filterempty.py \
    "${DATADIR}/${split}.${LG}.shuffle.doc10" \
    "${DATADIR}/${split}.${LG}.lead2" \
    "${OUTDIR}"
done


# Step 5: sentencepiece-encode the filtered files. The encoded files are
# written next to the filtered ones, in the output folder.
TOKEN=$OUTDIR

#######################################
# Encode one text file with fairseq/scripts/spm_encode.py and add sentence
# markers: every "< q >" becomes " </s>", every line is prefixed with "<s> "
# and suffixed with " </s>".
# Globals:   MODEL (read) - sentencepiece model passed as --model
# Arguments: $1 - input text file
#            $2 - output file (overwritten)
# Returns:   1 if the input file is not readable (no output file is
#            created in that case); otherwise the status of the pipeline.
#######################################
spm_encode_file() {
  local src=$1
  local dst=$2
  # Refuse early: redirecting from a missing file would still let sed create
  # an empty output file that the next step would then pick up.
  if [[ ! -r "${src}" ]]; then
    echo "spm_encode_file: cannot read input file: ${src}" >&2
    return 1
  fi
  python fairseq/scripts/spm_encode.py --model="${MODEL}" < "${src}" \
    | sed -e "s/< q >/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" \
    > "${dst}"
}

# Shuffled documents become <split>.<LG>.spm.doc, leads become
# <split>.<LG>.spm.sum.
for split in dev train; do
  spm_encode_file "${OUTDIR}/${split}.${LG}.shuffle.doc10" "${TOKEN}/${split}.${LG}.spm.doc"
  spm_encode_file "${OUTDIR}/${split}.${LG}.lead2" "${TOKEN}/${split}.${LG}.spm.sum"
done


# Step 6: run fairseq/preprocess.py on the encoded files. Input prefixes and
# the destination directory (bin/) all live under the folder that holds the
# encoded files; the same dictionary is used for source and target.
INPUT=$TOKEN
preprocess_args=(
  --source-lang doc
  --target-lang sum
  --trainpref "${INPUT}/train.${LG}.spm"
  --validpref "${INPUT}/dev.${LG}.spm"
  --destdir "${INPUT}/bin"
  --srcdict "${DICT}"
  --tgtdict "${DICT}"
  --workers 70
)
python fairseq/preprocess.py "${preprocess_args[@]}"