#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


# Prerequisite: the data must already be downloaded and processed with util/makeOption.py.
# Exactly two positional arguments are required; anything else prints the
# usage text and stops with status 1.
if (( $# != 2 )); then
  printf '%s\n' \
    "Run as following:" \
    "bash preRank.sh <data_folder> <output_folder>"
  exit 1
fi

# Positional arguments: the data folder and the output folder
# (see the usage message above).
DATA=$1
OUT=$2

# Directory containing the mBART files used below. Defaults to the
# previously hard-coded location; export MBART before running the script
# to use a different directory.
MBART=${MBART:-/home/tiger/mbart/model/mbart.cc25}
# SentencePiece model handed to spm_encode.py (--model).
MODEL=$MBART/sentence.bpe.model
# Dictionary handed to fairseq preprocess.py (--srcdict).
DICT=$MBART/dict.txt

# Space-separated lists; they are expanded unquoted on purpose so that the
# shell splits them into individual loop items.
SPLITS="dev train"
INPUT_TYPES="input0 input1 input2 input3 input4"

# For every input type and split, read $DATA/<split>.<input>, run it through
# spm_encode.py, apply the sed substitution ("</ s >" -> " </s>") and write
# the result to $DATA/<split>.<input>.spm.
for INPUT_TYPE in $INPUT_TYPES; do
  for SPLIT in $SPLITS; do
    echo "spm encoding $SPLIT/$INPUT_TYPE"
    python fairseq/scripts/spm_encode.py --model="$MODEL" < "$DATA/$SPLIT.$INPUT_TYPE" \
      | sed -e "s/<\/ s >/ <\/s>/g" > "$DATA/$SPLIT.$INPUT_TYPE.spm"
    # Snapshot the per-stage exit codes right away: without this, a failing
    # encoder (or an unreadable input file) is masked by sed succeeding and
    # later steps would consume an empty or truncated .spm file.
    ENCODE_STATUS=("${PIPESTATUS[@]}")
    if [[ ${ENCODE_STATUS[0]} -ne 0 || ${ENCODE_STATUS[1]} -ne 0 ]]; then
      echo "spm encoding failed for $SPLIT/$INPUT_TYPE" >&2
      exit 1
    fi
  done
done

# Run fairseq preprocess.py (source side only) once per input type, using the
# encoded train/dev files from $DATA and writing to $OUT/<input type>.
# INPUT_TYPES is a space-separated list and is left unquoted so it splits.
for INPUT_TYPE in $INPUT_TYPES; do
  # No LANG assignment here: the value was never read, and LANG is the
  # locale variable inherited by child processes such as python, so
  # overwriting it with a non-locale string is unsafe.
  python fairseq/preprocess.py \
    --only-source \
    --trainpref "$DATA/train.$INPUT_TYPE.spm" \
    --validpref "$DATA/dev.$INPUT_TYPE.spm" \
    --destdir "$OUT/$INPUT_TYPE" \
    --workers 15 \
    --srcdict "$DICT"
done

# Recreate $OUT/label from scratch and copy the label files into it.
# ${OUT:?} makes the shell abort when OUT is unset or empty, so the
# recursive delete can never resolve to "/label".
rm -rf -- "${OUT:?}/label"
mkdir -p -- "$OUT/label"
# "--" keeps paths that start with "-" from being parsed as options.
cp -- "$DATA/train.label" "$OUT/label/"
# The dev labels are stored under the name valid.label.
cp -- "$DATA/dev.label" "$OUT/label/valid.label"
