#!/usr/bin/env bash
# Tokenizes and subword-encodes the files listed in train_file.txt, provides
# dev files for every language pair, and binarizes the result with fairseq's
# preprocess.py.
#
# Must be run with bash: the loops below use bash-only substring expansions
# such as ${line: -8: -3}.

# Root of the working tree and of the third-party tools checked out below it.
WORKSPACE="/workspace"
TOOLS="${WORKSPACE}/tools"

# Moses preprocessing scripts used by the tokenization pipeline.
SCRIPTS="${TOOLS}/mosesdecoder-master/scripts"
TOKENIZER="${SCRIPTS}/tokenizer/tokenizer.perl"
NORM_PUNC="${SCRIPTS}/tokenizer/normalize-punctuation.perl"
REM_NON_PRINT_CHAR="${SCRIPTS}/tokenizer/remove-non-printing-char.perl"

# Load the workspace-wide shell settings first; the assignments that follow
# take precedence over anything that file may define under the same names.
source "${WORKSPACE}/bashrc"

# Corpus root, fairseq checkout and subword-nmt checkout.
ORI_DATA="/workspace/datas/opus-100/opus-100-corpus/v1.0"
CODES="${WORKSPACE}/codes/git-repo/fairseq-master"
BPEROOT="${TOOLS}/subword-nmt"

# Sub-directories of the corpus root.
supervised_data="${ORI_DATA}/supervised"
zero_data="${ORI_DATA}/zero-shot"

# Directory that receives the intermediate train files (the current one).
curr_dir="."

# Subword model and dictionary prefix, both taken from the sibling
# opus_naive_mnmt directory. bpe_nums is only used to build the codes name.
bpe_nums=32000
bpe_codes="../opus_naive_mnmt/opus.${bpe_nums}.codes"
bpe_data="../opus_naive_mnmt/data_combine"

# Seed ./data-bin with the binarized data from opus_naive_mnmt; the
# binarization loop at the end of this script writes into the same directory.
cp -r ../opus_naive_mnmt/data-bin ./ || {
    echo "error: could not copy ../opus_naive_mnmt/data-bin" >&2
    exit 1
}

# Build the list of input files for the multi-way data.
# NOTE(review): presumably this is what writes train_file.txt, which every
# loop below reads -- confirm in get_file_list.py. Stop here on failure so the
# loops never run against a missing or stale list.
python ./get_file_list.py "$zero_data/multi-way" || {
    echo "error: get_file_list.py failed" >&2
    exit 1
}

# Tokenize and subword-encode every file listed in train_file.txt.
# Each line is treated as a path whose last two characters are the language
# code and whose five characters before the preceding separator are the pair
# prefix (e.g. a name ending in "de-fr.de" gives lang=de, prefix=de-fr).
echo "apply bpe for train sets"
while IFS= read -r line || [[ -n "$line" ]]; do
    # Blank lines carry no path; the fixed-offset slicing needs a real one.
    [[ -n "$line" ]] || continue

    lang=${line: -2}
    prefix=${line: -8: -3}
    tok_file="${curr_dir}/train.${prefix}.tok.${lang}"

    # Punctuation normalization -> non-printing char removal -> tokenization.
    # NOTE: output is appended (>>), so a tok file left over from an earlier
    # run is extended rather than replaced; remove stale files before re-running.
    perl "$NORM_PUNC" "$lang" < "$line" |
        perl "$REM_NON_PRINT_CHAR" |
        perl "$TOKENIZER" -threads 8 -a -l "$lang" >> "$tok_file"

    # Encode the tokenized text with the subword model ${bpe_codes}.model.
    sh "$TOOLS/sentencepiece-tool/encode_rawText2sp.sh" "${bpe_codes}.model" \
        "$tok_file" "${curr_dir}/train.${prefix}.tok.bpe.${lang}"
done < train_file.txt

# Ensure every language pair in train_file.txt has dev files, because the
# binarization step passes --validpref dev.<pair>.tok.bpe for each pair.
while IFS= read -r line || [[ -n "$line" ]]; do
    [[ -n "$line" ]] || continue

    # Slice the pair prefix exactly as the train and binarization loops do
    # (${line: -8: -3}), so the dev files get the name --validpref looks for.
    prefix=${line: -8: -3}
    src_lang=${prefix: 0: 2}
    tgt_lang=${prefix: 3}

    # A pair may be listed more than once; only create its dev files once.
    if [[ ! -f "dev.${prefix}.tok.bpe.${src_lang}" ]]; then
        echo "cp $prefix"
        # Stand-in dev data: the de-en dev files from opus_naive_mnmt are
        # reused for every pair (en side -> source, de side -> target).
        cp ../opus_naive_mnmt/dev.de-en.tok.bpe.en "dev.${prefix}.tok.bpe.${src_lang}"
        cp ../opus_naive_mnmt/dev.de-en.tok.bpe.de "dev.${prefix}.tok.bpe.${tgt_lang}"
    fi
done < train_file.txt

# Binarize each language pair with fairseq's preprocess.py, reusing the
# existing dictionary ${bpe_data}.bpe.voc and writing into ./data-bin.
# train_file.txt may list the same pair several times (the train loop reads a
# language code from the end of each line), so each pair is processed once.
declare -A seen_pairs=()
while IFS= read -r line || [[ -n "$line" ]]; do
    prefix=${line: -8: -3}
    # Blank or too-short lines yield an empty prefix: nothing to binarize.
    [[ -n "$prefix" ]] || continue

    # Skip pairs already handled; a repeat run would only redo the same work.
    [[ -z "${seen_pairs[$prefix]:-}" ]] || continue
    seen_pairs[$prefix]=1

    src_lang=${prefix: 0: 2}
    tgt_lang=${prefix: 3}

    echo "$prefix $src_lang $tgt_lang"

    python "$CODES/preprocess.py" \
        --source-lang "$src_lang" --target-lang "$tgt_lang" \
        --task multilingual_translation \
        --trainpref "$curr_dir/train.${prefix}.tok.bpe" \
        --validpref "$curr_dir/dev.${prefix}.tok.bpe" \
        --destdir ./data-bin \
        --srcdict "${bpe_data}.bpe.voc" \
        --joined-dictionary \
        --workers 10
done < train_file.txt
