#!/usr/bin/env bash
#
# Encode the train/valid input and target files with spm_encode, then run
# fairseq-preprocess on the encoded files.
#
# Usage: <script> LANGUAGE MODEL_TYPE DATA_TYPE [LANGUAGE_ID]
#
# Reads:
#   ../models/LANGUAGE/MODEL_TYPE/DATA_TYPE-data/{train,valid}.{input,target}
# Writes:
#   ../models/LANGUAGE/MODEL_TYPE/DATA_TYPE-data/{train,valid}.spm.{input,target}
#   ../models/LANGUAGE/MODEL_TYPE/DATA_TYPE-bin/   (fairseq-preprocess --destdir)
#
# All paths are relative to the current working directory, so the script must
# be started from the directory those "../" paths are meant to resolve against.

# Abort on the first failing command or unset variable so fairseq-preprocess
# never runs on missing or partially written .spm files.
set -eu

if [ "$#" -lt 3 ] || [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
  printf 'Usage: %s LANGUAGE MODEL_TYPE DATA_TYPE [LANGUAGE_ID]\n' "${0##*/}" >&2
  exit 2
fi

LANGUAGE=$1
MODEL_TYPE=$2
DATA_TYPE=$3
# Fourth argument is accepted so existing callers keep working; nothing in
# this script reads it.
# shellcheck disable=SC2034
LANGUAGE_ID=${4:-}

SPM=../sentencepiece/build/src/spm_encode
MODEL=../models/mbart.cc25.v2/sentence.bpe.model
DICT=../models/mbart.cc25.v2/dict.txt
DATA=../models/${LANGUAGE}/${MODEL_TYPE}/${DATA_TYPE}-data
DEST=../models/${LANGUAGE}/${MODEL_TYPE}/${DATA_TYPE}-bin

TRAIN=train
VALID=valid
SRC=input
TGT=target

# Encode every split/side combination: <split>.<side> -> <split>.spm.<side>.
# Order matches the original script: train.input, train.target, valid.input,
# valid.target.
for split in "${TRAIN}" "${VALID}"; do
  for side in "${SRC}" "${TGT}"; do
    "${SPM}" --model="${MODEL}" \
      < "${DATA}/${split}.${side}" \
      > "${DATA}/${split}.spm.${side}"
  done
done

# The same dictionary file is passed for both the source and target side, and
# both thresholds are 0. The --trainpref/--validpref values are prefixes to
# which the language suffixes (.input / .target) are appended.
fairseq-preprocess \
  --source-lang "${SRC}" \
  --target-lang "${TGT}" \
  --destdir "${DEST}" \
  --thresholdtgt 0 \
  --thresholdsrc 0 \
  --srcdict "${DICT}" \
  --tgtdict "${DICT}" \
  --workers 70 \
  --trainpref "${DATA}/${TRAIN}.spm" \
  --validpref "${DATA}/${VALID}.spm"


