#!/usr/bin/env bash

# Directory holding the <lang>.<split>.txt inputs; the .spm outputs produced by
# the tokenize step are written next to them.
ccDIR='/opt/tiger/sumtest/cc100'

# setup
# HDFS user root, and the folder beneath it that stores the pretrained archives.
wxz_prefix='hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0'
pretrained_path="${wxz_prefix}/Workspace/Multilingual/pretrained"

# Local mBART directory (under the home directory) and the files read from it.
MBART=~/mbart.cc25
MODEL="${MBART}/sentence.bpe.model"
# NOTE(review): DICT is not referenced by the steps visible in this script;
# presumably consumed by a later stage -- confirm before removing.
DICT="${MBART}/dict_extend.txt"


# Fetch the pretrained mBART files on first use: when ${MBART} is not already a
# directory, copy the archive from HDFS into the home directory and unpack it
# there. Every step is checked so that a failed download or extraction aborts
# the script instead of letting later steps run without a model.
if [[ ! -d "${MBART}" ]]; then
    echo "Load pretrained model from ${pretrained_path}/mbart.CC25.tar.gz to ~/" >&2
    if ! hadoop fs -copyToLocal "${pretrained_path}/mbart.CC25.tar.gz" ~/; then
        echo "Failed to copy ${pretrained_path}/mbart.CC25.tar.gz from HDFS" >&2
        exit 1
    fi
    if ! tar -xvzf ~/mbart.CC25.tar.gz -C ~/; then
        echo "Failed to extract ~/mbart.CC25.tar.gz" >&2
        exit 1
    fi
    # The archive is expected to unpack into ${MBART}; everything downstream
    # reads the model from that directory, so its absence is fatal.
    if [[ ! -d "${MBART}" ]]; then
        echo "Extraction finished but ${MBART} does not exist" >&2
        exit 1
    fi
else
    echo "Pretrained model in ${MBART}" >&2
fi

# tokenize
# Encode every ${ccDIR}/<lang>.<split>.txt with the SentencePiece model and
# write the result to the matching .spm file. spm_encode.py is addressed
# relative to the crossLingualTransfer checkout, hence the cd, which must
# succeed or the relative script path would resolve somewhere else.
cd /opt/tiger/sumtest/crossLingualTransfer || {
    echo "Cannot cd to /opt/tiger/sumtest/crossLingualTransfer" >&2
    exit 1
}

# Without pipefail the pipeline status is sed's, which hides encoder failures
# and unreadable input files.
set -o pipefail

tokenize_status=0
for split in train dev
do
    for lg in en zh fr
    do
        src_txt="${ccDIR}/${lg}.${split}.txt"
        dst_spm="${ccDIR}/${lg}.${split}.spm"
        # The sed step rewrites each "< q >" to " <q>" in the encoded stream
        # (presumably re-joining a <q> marker split by the encoder -- confirm).
        if ! python3 fairseq/scripts/spm_encode.py --model="${MODEL}" < "${src_txt}" \
            | sed -e "s/< q >/ <q>/g" > "${dst_spm}"
        then
            echo "Tokenization failed for ${src_txt}" >&2
            # Do not leave an empty or truncated output for later stages to consume.
            rm -f -- "${dst_spm}"
            tokenize_status=1
        fi
    done
done

# Remaining files are still attempted after a failure, but the script must not
# report success if any of them could not be tokenized.
if [[ "${tokenize_status}" -ne 0 ]]; then
    exit "${tokenize_status}"
fi
