# Environment and path setup for the OPUS-100 preprocessing pipeline.
# NOTE: the loops further down rely on bash substring expansion
# (e.g. ${line: -2}), so this script has to be run with bash.

# --- Moses tokenizer scripts ------------------------------------------------
WORKSPACE="/workspace"
TOOLS="${WORKSPACE}/tools"
SCRIPTS="${TOOLS}/mosesdecoder-master/scripts"
TOKENIZER="${SCRIPTS}/tokenizer/tokenizer.perl"
NORM_PUNC="${SCRIPTS}/tokenizer/normalize-punctuation.perl"
REM_NON_PRINT_CHAR="${SCRIPTS}/tokenizer/remove-non-printing-char.perl"

# Load the workspace-level shell configuration before the remaining
# variables are defined (so the assignments below take precedence).
source "${WORKSPACE}/bashrc"

# --- Other tools and raw corpus location ------------------------------------
BPEROOT="${TOOLS}/subword-nmt"
CODES="${WORKSPACE}/codes/git-repo/fairseq-master"
ORI_DATA="/workspace/datas/opus-100/opus-100-corpus/v1.0"

supervised_data="${ORI_DATA}/supervised"
zero_data="${ORI_DATA}/zero-shot"

# --- Experiment outputs -----------------------------------------------------
curr_dir="/workspace/experiments/experiments-opus-100/opus_naive_mnmt"
bpe_nums=32000
bpe_codes="opus.${bpe_nums}.codes"
# Single file into which all tokenized training text is concatenated.
bpe_data="${curr_dir}/data_combine"

# Collect the corpus file lists from both data directories.
# NOTE(review): presumably this writes the train_file.txt / dev_file.txt /
# test_file.txt lists consumed below -- confirm against get_file_list.py.
python get_file_list.py "${supervised_data}" "${zero_data}"

# Start from a clean combined corpus; the training loop appends to it.
rm -rf "${bpe_data}"

# Tokenize every training file named in train_file.txt and append the result
# to the combined corpus used for subword learning.
#
# Each line of the list is a path to one training file. The language code is
# taken from the last two characters of the path and the pair prefix
# (e.g. "de-en") from the five characters starting 14 from the end, so the
# path is expected to end in "<xx-yy>-train.<ll>".
#
# The list is read from ${curr_dir}, matching the later loops that reuse it.
# `read -r` keeps backslashes literal; the default IFS is kept on purpose so
# surrounding whitespace is still trimmed before the suffix is sliced.
#
# NOTE(review): the per-file output is opened with >>, so re-running the
# script without deleting old train.*.tok.* files duplicates their content.
while read -r line; do
    lang=${line: -2}
    prefix=${line: -14: -9}
    tok_file="${curr_dir}/train.${prefix}.tok.${lang}"
    echo "$lang"
    echo "$prefix"
    # normalize punctuation -> strip non-printing chars -> Moses tokenizer
    perl "$NORM_PUNC" "$lang" < "$line" \
        | perl "$REM_NON_PRINT_CHAR" \
        | perl "$TOKENIZER" -threads 8 -a -l "$lang" >> "$tok_file"
    cat "$tok_file" >> "$bpe_data"
done < "${curr_dir}/train_file.txt"

# Learn the subword model on the combined tokenized corpus, then segment that
# same corpus with it. The progress message still mentions learn_bpe.py, but
# the scripts invoked are the ones under sentencepiece-tool.
sp_model_prefix="${curr_dir}/${bpe_codes}"
echo "learn_bpe.py on ${bpe_data}..."
sh "${TOOLS}/sentencepiece-tool/learn_sentencepiece.sh" \
    "${bpe_data}" "${sp_model_prefix}" "${bpe_nums}"
sh "${TOOLS}/sentencepiece-tool/encode_rawText2sp.sh" \
    "${sp_model_prefix}.model" "${bpe_data}" "${bpe_data}.bpe"

# Build the vocabulary from the segmented combined corpus.
# NOTE(review): the binarization step later reads ${bpe_data}.bpe.voc, so this
# script presumably writes that file -- confirm in build_fairseq_dict.py.
echo "build dictionary"
python "${TOOLS}/build_fairseq_dict.py" "${bpe_data}.bpe"

echo "apply bpe for train sets"
# Segment each tokenized training file with the learned subword model.
# For every path in train_file.txt the language (last two characters) and the
# pair prefix are recovered exactly as in the tokenization loop, and
# train.<prefix>.tok.<lang> is encoded into train.<prefix>.tok.bpe.<lang>.
# `read -r` keeps backslashes literal; default IFS still trims whitespace.
while read -r line; do
    lang=${line: -2}
    prefix=${line: -14: -9}
    tok_file="${curr_dir}/train.${prefix}.tok.${lang}"
    sh "$TOOLS/sentencepiece-tool/encode_rawText2sp.sh" \
        "${curr_dir}/${bpe_codes}.model" \
        "$tok_file" \
        "${curr_dir}/train.${prefix}.tok.bpe.${lang}"
done < "${curr_dir}/train_file.txt"

echo "process dev sets"

# Tokenize and subword-encode every dev file listed in dev_file.txt.
# The path is expected to end in "<xx-yy>-dev.<ll>": the language is the last
# two characters and the pair prefix the five characters starting 12 from the
# end. `read -r` keeps backslashes literal; default IFS still trims whitespace.
#
# NOTE(review): the tokenized file is opened with >>, so stale dev.*.tok.*
# files from a previous run are appended to rather than replaced.
while read -r line; do
    lang=${line: -2}
    prefix=${line: -12: -7}
    tok_file="${curr_dir}/dev.${prefix}.tok.${lang}"
    echo "$lang"
    echo "$prefix"
    # normalize punctuation -> strip non-printing chars -> Moses tokenizer
    perl "$NORM_PUNC" "$lang" < "$line" \
        | perl "$REM_NON_PRINT_CHAR" \
        | perl "$TOKENIZER" -threads 8 -a -l "$lang" >> "$tok_file"
    sh "$TOOLS/sentencepiece-tool/encode_rawText2sp.sh" \
        "${curr_dir}/${bpe_codes}.model" \
        "$tok_file" \
        "${curr_dir}/dev.${prefix}.tok.bpe.${lang}"
done < "${curr_dir}/dev_file.txt"

# This loop handles the test files, so the progress message says so.
echo "process test sets"
# Tokenize and subword-encode every test file listed in test_file.txt.
# The path is expected to end in "<xx-yy>-test.<ll>": the language is the last
# two characters and the pair prefix the five characters starting 13 from the
# end. `read -r` keeps backslashes literal; default IFS still trims whitespace.
#
# NOTE(review): the tokenized file is opened with >>, so stale test.*.tok.*
# files from a previous run are appended to rather than replaced.
while read -r line; do
    lang=${line: -2}
    prefix=${line: -13: -8}
    tok_file="${curr_dir}/test.${prefix}.tok.${lang}"
    echo "$lang"
    echo "$prefix"
    # normalize punctuation -> strip non-printing chars -> Moses tokenizer
    perl "$NORM_PUNC" "$lang" < "$line" \
        | perl "$REM_NON_PRINT_CHAR" \
        | perl "$TOKENIZER" -threads 8 -a -l "$lang" >> "$tok_file"
    sh "$TOOLS/sentencepiece-tool/encode_rawText2sp.sh" \
        "${curr_dir}/${bpe_codes}.model" \
        "$tok_file" \
        "${curr_dir}/test.${prefix}.tok.bpe.${lang}"
done < "${curr_dir}/test_file.txt"

# Give every training pair a dev set: when dev.<prefix>.tok.bpe.<src> does not
# exist, the de-en dev files are copied in as stand-ins (the en side becomes
# the pair's source-language file, the de side its target-language file).
# NOTE(review): presumably this exists only so the --validpref passed to
# preprocess.py always resolves -- confirm the placeholder data is acceptable.
#
# All paths are anchored at ${curr_dir}, which is where the dev loop writes
# its output and where the binarization step looks for it.
while read -r line; do
    prefix=${line: -14: -9}
    src_lang=${prefix: 0: 2}
    tgt_lang=${prefix: 3}
    if [ ! -f "${curr_dir}/dev.${prefix}.tok.bpe.${src_lang}" ]; then
        echo "cp $prefix"
        cp "${curr_dir}/dev.de-en.tok.bpe.en" "${curr_dir}/dev.${prefix}.tok.bpe.${src_lang}"
        cp "${curr_dir}/dev.de-en.tok.bpe.de" "${curr_dir}/dev.${prefix}.tok.bpe.${tgt_lang}"
    fi
done < "${curr_dir}/train_file.txt"

echo "processing training sets and binarize"
# Binarize each language pair with fairseq's preprocess.py.
# train_file.txt is walked line by line; a line whose pair prefix equals the
# previous line's prefix is skipped, so consecutive entries for the same pair
# trigger only one preprocess run.
pre_prefix=""
while read -r line; do
    prefix=${line: -14: -9}
    # Quoted comparison inside [[ ]]: pre_prefix is empty on the first pass,
    # and an unquoted `[ == xx-yy ]` makes test fail with a syntax error.
    if [[ "$pre_prefix" == "$prefix" ]]; then
        continue
    fi
    pre_prefix=$prefix
    # The prefix has the form "<src>-<tgt>" with two-letter codes.
    src_lang=${prefix: 0: 2}
    tgt_lang=${prefix: 3}

    echo "$prefix" "${src_lang}" "${tgt_lang}"

    python "$CODES/preprocess.py" \
        --source-lang "$src_lang" --target-lang "$tgt_lang" \
        --task multilingual_translation \
        --trainpref "$curr_dir/train.${prefix}.tok.bpe" \
        --validpref "$curr_dir/dev.${prefix}.tok.bpe" \
        --destdir ./data-bin \
        --srcdict "${bpe_data}.bpe.voc" \
        --joined-dictionary \
        --workers 10
done < "${curr_dir}/train_file.txt"
