#!/usr/bin/env bash

# Local root directory under which the sample_<split>_doc folders live.
ccDIR='/home/tiger/cc100'

# setup
# Remote HDFS locations: the user prefix and the pretrained-model folder below it.
wxz_prefix='hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0'
pretrained_path="${wxz_prefix}/Workspace/Multilingual/pretrained"

# Local mBART directory and the files expected inside it.
# The tilde is left unquoted on purpose so the shell expands it to the home directory.
MBART=~/mbart.cc25
MODEL="${MBART}/sentence.bpe.model"   # passed to spm_encode.py via --model
DICT="${MBART}/dict_extend.txt"       # not referenced in the visible part of this script


# Make sure the pretrained model directory ${MBART} exists locally.
# When it is missing, copy mbart.CC25.tar.gz from ${pretrained_path} into the
# home directory and unpack it there. All progress messages go to stderr.
if [[ ! -d "${MBART}" ]]; then
    echo "Load pretrained model from ${pretrained_path}/mbart.CC25.tar.gz to ~/" >&2
    mbart_archive=~/mbart.CC25.tar.gz
    # Skip the download when an archive from an earlier run is already present;
    # copyToLocal would otherwise fail on the existing file.
    if [[ -f "${mbart_archive}" ]]; then
        echo "Archive ${mbart_archive} already exists, skipping download" >&2
    elif ! hadoop fs -copyToLocal "${pretrained_path}/mbart.CC25.tar.gz" ~/; then
        # Without the archive there is nothing to extract, so stop here
        # instead of letting later steps fail with a less obvious error.
        echo "Failed to copy ${pretrained_path}/mbart.CC25.tar.gz to ~/" >&2
        exit 1
    fi
    if ! tar -xvzf "${mbart_archive}" -C ~/; then
        echo "Failed to extract ${mbart_archive}" >&2
        exit 1
    fi
else
    echo "Pretrained model in ${MBART}" >&2
fi

# tokenize
# For every split (train, dev) and language (de, es, zh), feed
# ${ccDIR}/sample_<split>_doc/<lg>.doc.sample through spm_encode.py with the
# model at ${MODEL}, replace each "< q >" in the output with " <q>", and write
# the result to <lg>.doc.spm.sample in the same directory.
#
# The fairseq script path is relative, so it only resolves after moving two
# directories up from the invocation directory; abort if that move fails.
cd ../.. || { echo "Cannot change directory to ../.. from ${PWD}" >&2; exit 1; }

tokenize_failed=0
for split in train dev
do
    sampleDIR="${ccDIR}/sample_${split}_doc"
    for lg in de es zh
    do
        python3 fairseq/scripts/spm_encode.py --model="${MODEL}" < "${sampleDIR}/${lg}.doc.sample" \
            | sed -e "s/< q >/ <q>/g" > "${sampleDIR}/${lg}.doc.spm.sample"
        # Capture both stage statuses immediately: the pipeline's own exit code
        # is only sed's, which would hide a failing encoder or missing input.
        stage_status=("${PIPESTATUS[@]}")
        if (( stage_status[0] != 0 || stage_status[1] != 0 )); then
            echo "Tokenization failed for ${sampleDIR}/${lg}.doc.sample (encode=${stage_status[0]}, sed=${stage_status[1]})" >&2
            tokenize_failed=1
        fi
    done
done

# Remaining files are still processed after a failure, but the script must not
# report success when any output may be empty or truncated.
if (( tokenize_failed )); then
    exit 1
fi
