#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


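# Binarizes a parallel corpus with fairseq/preprocess.py in both directions
# (LG1 -> LG2 and LG2 -> LG1). The train/dev prefixes passed below are
# "$DATA/train.spm" and "$DATA/dev.spm", so the SentencePiece-encoded files
# (see the commented-out spm_encode step) are presumably expected to exist.
# Data should be downloaded and processed with reprocess_RACE.py.
# NOTE(review): the reprocess_RACE.py reference may be stale for this
# translation script - confirm.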
# Require exactly four positional arguments:
#   <data_folder> <output_folder> <LG1> <LG2>
# On any other count, print the usage text and exit with status 1.
# The usage text is a diagnostic, so it goes to stderr rather than stdout;
# this keeps it out of any pipeline that captures the script's output.
if (( $# != 4 )); then
  {
    echo "Run as following:"
    echo "bash preTrans.sh <data_folder> <output_folder> <LG1> <LG2>"
  } >&2
  exit 1
fi

# Give the four positional arguments descriptive names.
DATA="$1"  # <data_folder>: where the input corpus files live
OUT="$2"   # <output_folder>: destination for the generated files
LG1="$3"   # first language code, e.g. de_DE
LG2="$4"   # second language code, e.g. en_XX

# Create the output folder (and any missing parents) unless it already exists.
[[ -d "$OUT" ]] || mkdir -p "$OUT"

# Directory holding the mBART files used below. The original hard-coded
# location stays the default, but it can now be overridden by exporting
# MBART before running the script, so other machines need no source edit.
MBART="${MBART:-/home/tiger/mbart/model/mbart.cc25}"
# SentencePiece model; within this script it is only referenced by the
# commented-out spm_encode step.
MODEL="$MBART/sentence.bpe.model"
# Dictionary passed to preprocess.py as both --srcdict and --tgtdict.
DICT="$MBART/dict.txt"

# Keep only the part of each language code before the first underscore
# (de_DE -> de, en_XX -> en). Parameter expansion gives the same result as
# `cut -d _ -f 1` without spawning processes or word-splitting the value.
INPUT1="${LG1%%_*}"
INPUT2="${LG2%%_*}"
echo "$INPUT1 -> $INPUT2"

# SPLITS="dev train"
# for SPLIT in $SPLITS
#     do
#     echo "spm encoding $SPLIT.$INPUT1"
#     python fairseq/scripts/spm_encode.py --model=$MODEL < "$DATA/$SPLIT.$INPUT1" | sed -e "s/$/ [$LG1]/g" > "$DATA/$SPLIT.spm.$INPUT1"
#     echo "spm encoding $SPLIT.$INPUT2"
#     python fairseq/scripts/spm_encode.py --model=$MODEL < "$DATA/$SPLIT.$INPUT2" | sed -e "s/$/ [$LG2]/g" > "$DATA/$SPLIT.spm.$INPUT2"
#     done

# Number of worker processes handed to preprocess.py. Defaults to the
# original hard-coded value of 70; export WORKERS to override it.
WORKERS="${WORKERS:-70}"

#######################################
# Run fairseq/preprocess.py for one translation direction.
# Globals:
#   DATA    (read) folder used to build the train/dev file prefixes
#   OUT     (read) destination directory
#   DICT    (read) dictionary used for both the source and the target side
#   WORKERS (read) value passed to --workers
# Arguments:
#   $1 - source language code
#   $2 - target language code
# Returns:
#   exit status of the python process
#######################################
binarize_direction() {
  local src=$1
  local tgt=$2

  python fairseq/preprocess.py \
    --source-lang "$src" \
    --target-lang "$tgt" \
    --trainpref "$DATA/train.spm" \
    --validpref "$DATA/dev.spm" \
    --destdir "$OUT" \
    --workers "$WORKERS" \
    --srcdict "$DICT" \
    --tgtdict "$DICT"
}

# Process both directions into the same destination directory. Stop at the
# first failure and propagate its exit status, so a broken first direction
# is no longer hidden behind the status of the second run.
binarize_direction "$INPUT1" "$INPUT2" || exit
binarize_direction "$INPUT2" "$INPUT1" || exit
