#!/usr/bin/bash
# ---------------------------------------------------------------------------
# Filesystem layout: everything below hangs off a single workspace root.
# ---------------------------------------------------------------------------
WORKSPACE="/workspace"
TOOLS_DIR="${WORKSPACE}/tools"
CODES_DIR="${WORKSPACE}/codes/fairseq_master"
BPEROOT="${TOOLS_DIR}/subword-nmt"

# Perl helpers shipped in the mosesdecoder checkout under the tools directory.
SCRIPTS="${TOOLS_DIR}/mosesdecoder-master/scripts"
TOKENIZER="${SCRIPTS}/tokenizer/tokenizer.perl"
DETOKENIZER="${SCRIPTS}/tokenizer/detokenizer.perl"
NORM_PUNC="${SCRIPTS}/tokenizer/normalize-punctuation.perl"
REM_NON_PRINT_CHAR="${SCRIPTS}/tokenizer/remove-non-printing-char.perl"

# ---------------------------------------------------------------------------
# Run inputs: data directory (relative to the current working directory),
# model checkpoint, subword model file and the language direction.
# ---------------------------------------------------------------------------
data_dir="."
model_path="/workspace/models/checkpoint_last.pt"
bpe_codes="./opus.32000.codes.model"
source_lang="ar"
target_lang="zh"

# Expose only GPU index 1 to child processes.
CUDA_VISIBLE_DEVICES="1"
export CUDA_VISIBLE_DEVICES

# ---------------------------------------------------------------------------
# Mutable state used by the rest of the script.
# ---------------------------------------------------------------------------
lang_pairs=
pre_prefix=
langs="en"
# Walk the input line by line, derive a language-pair prefix from each line,
# grow the hyphen-separated language list in $langs, and run generation plus
# post-processing. The redirection that feeds this loop, and the closing
# `done`, are not visible in this part of the file.
# NOTE(review): `read` is used without -r, so backslashes in the input are
# interpreted and leading/trailing whitespace is trimmed - confirm intended.
while read line
do 
    # Five characters starting 14 from the end of the line and stopping 9
    # before the end. Presumably an "xx-yy" language pair embedded in a
    # fixed-width file-name suffix - TODO confirm against the input format.
    # The negative length form needs bash 4.2 or newer.
    prefix=${line: -14: -9}
    # Skip lines whose prefix equals that of the previously processed line
    # (the trailing "x" guards the comparison against empty operands).
    if [ "$prefix"x == "$pre_prefix"x ]; then
        #echo "same prefix"
        # NOTE(review): only one enclosing loop is visible here; bash resumes
        # the outermost enclosing loop when the count exceeds the nesting
        # depth, so this acts like a plain `continue` unless the script is
        # run inside another loop.
        continue 2
    fi
    pre_prefix=$prefix
    # Characters 0-1 of the prefix are the source code; everything from
    # index 3 onward is the target code (index 2 is skipped as a separator).
    src_lang=${prefix: 0: 2}
    tgt_lang=${prefix: 3}

    # Append the source code to $langs unless it already occurs there.
    # NOTE(review): grep does a substring/regex match on the whole list, not
    # an exact per-element comparison.
    include=$(echo $langs | grep "${src_lang}")
    if [[ "$include" != "" ]]; then
        echo "src include"
    else
        langs=${langs}-${src_lang}
    fi

    # Same membership test for the target code. The body below runs only
    # when the target code already occurs in $langs; the matching `fi` is
    # not visible in this part of the file.
    include=$(echo $langs | grep "${tgt_lang}")
    if [[ "$include" != "" ]]; then

# Run generate.py over the `train` subset of $data_dir/data-bin with beam 4
# and batch size 128, capturing stdout in bpe.out_noise_0.4.
# NOTE(review): the direction comes from the script-level $source_lang and
# $target_lang, not from the per-line $src_lang / $tgt_lang computed above -
# confirm this is intended.
python ${CODES_DIR}/generate.py $data_dir/data-bin \
       --task multilingual_translation \
       --gen-subset train \
       --source-lang ${source_lang} \
       --target-lang ${target_lang} \
       --lang-pairs ${source_lang}-${target_lang} \
       --path $model_path \
       --beam 4 \
       --batch-size 128 \
       > ${data_dir}/bpe.out_noise_0.4

# Keep the third tab-separated field of every line that starts with "H"
# (presumably the hypothesis text in fairseq's generate output - verify).
grep ^H ${data_dir}/bpe.out_noise_0.4 | cut -f3 >${data_dir}/bpe.out.noise0.4.clean

# Hand the subword model, the cleaned output and the destination file to the
# decode helper. Presumably this reverses the subword segmentation into raw
# text; the helper is not part of this file, so confirm its argument order.
sh $TOOLS_DIR/sentencepiece-tool/decode_sp2rawText.sh ${bpe_codes} ${data_dir}/bpe.out.noise0.4.clean ${data_dir}/zh.out
