# ---------------------------------------------------------------------------
# Tool locations. These are assigned before the workspace bashrc is sourced.
# ---------------------------------------------------------------------------
WORKSPACE="/workspace"
TOOLS="${WORKSPACE}/tools"
SCRIPTS="${TOOLS}/mosesdecoder-master/scripts"
TOKENIZER="${SCRIPTS}/tokenizer/tokenizer.perl"
NORM_PUNC="${SCRIPTS}/tokenizer/normalize-punctuation.perl"
REM_NON_PRINT_CHAR="${SCRIPTS}/tokenizer/remove-non-printing-char.perl"

# Pull in the workspace environment (contents not visible from this script).
source "${WORKSPACE}/bashrc"
# NOTE(review): BPEROOT is not referenced anywhere else in this script.
BPEROOT="${TOOLS}/subword-nmt"
CODES="${WORKSPACE}/codes/git-repo/NMT_with_pretraining_underDev"
ORI_DATA="/datas/opus-100/opus-100-corpus/v1.0"
supervised_data="${ORI_DATA}/supervised"

# Language pair and the corpus directories used for it.
src="en"
tgt="zh"
data_dir="${supervised_data}/en-zh"
# NOTE(review): test_dir is not referenced anywhere else in this script.
test_dir="${ORI_DATA}/test"

# All intermediate and output files are written under the directory the
# script is started from.
curr_dir="$(pwd)"
# Passed as the third argument to learn_sentencepiece.sh
# (presumably the subword vocabulary size -- TODO confirm).
bpe_nums="10000"
bpe_codes="${src}-${tgt}.codes"
# File that receives the concatenated en+zh text used to learn the model.
bpe_data="data_combine"
# Separator string handed to merge_file_with_sep.py.
sep_token='<sep>'

# ---------------------------------------------------------------------------
# Training data
# ---------------------------------------------------------------------------

# English side: normalize punctuation, strip non-printing characters, then
# tokenize. The output is truncated (">") instead of appended: this file is
# later replaced by its blank-filtered version, so appending on a re-run
# would put the full corpus on top of the filtered one and leave the English
# side with more lines than the Chinese side.
perl "$NORM_PUNC" 'en' < "${data_dir}/opus.en-zh-train.en" \
    | perl "$REM_NON_PRINT_CHAR" \
    | perl "$TOKENIZER" -threads 8 -a -l 'en' > "${curr_dir}/train.en-zh.tok.en"

# Chinese side: copied as-is, no Moses preprocessing is applied here.
cp "${data_dir}/opus.en-zh-train.zh" "${curr_dir}/train.en-zh.tok.zh"

# Run the blank-removal helper on the pair. The two mv commands consume the
# "<input>.BlankRem" files and put them back under the original names.
python "$TOOLS/remove_blank_in_para_data.py" \
    "${curr_dir}/train.en-zh.tok.en" "${curr_dir}/train.en-zh.tok.zh"
mv "${curr_dir}/train.en-zh.tok.en.BlankRem" "${curr_dir}/train.en-zh.tok.en"
mv "${curr_dir}/train.en-zh.tok.zh.BlankRem" "${curr_dir}/train.en-zh.tok.zh"

# Rebuild the joint en+zh text from scratch (any leftover file or directory
# of the same name is removed first).
rm -rf -- "${bpe_data}"
cat "${curr_dir}/train.en-zh.tok.en" "${curr_dir}/train.en-zh.tok.zh" > "${bpe_data}"

# Learn the subword model on the joint text, then encode both sides with it.
# The encode step reads "${bpe_codes}.model", so the learn script is
# presumably expected to write "<prefix>.model" -- TODO confirm.
sh "$TOOLS/sentencepiece-tool/learn_sentencepiece.sh" \
    "${bpe_data}" "${curr_dir}/${bpe_codes}" "$bpe_nums"
sh "$TOOLS/sentencepiece-tool/encode_rawText2sp.sh" \
    "${curr_dir}/${bpe_codes}.model" \
    "${curr_dir}/train.en-zh.tok.zh" "${curr_dir}/train.en-zh.bpe.zh"
sh "$TOOLS/sentencepiece-tool/encode_rawText2sp.sh" \
    "${curr_dir}/${bpe_codes}.model" \
    "${curr_dir}/train.en-zh.tok.en" "${curr_dir}/train.en-zh.bpe.en"

# Build a dictionary from the encoded Chinese side. The next step reads
# train.en-zh.bpe.zh.voc, presumably produced here -- TODO confirm.
python "$TOOLS/build_fairseq_dict.py" "${curr_dir}/train.en-zh.bpe.zh"

# random_insert_delete.py is looked up in the current directory.
# Arguments: vocabulary, encoded en, encoded zh, then three ins_del.train.*
# paths (presumably the outputs -- TODO confirm against the script).
python random_insert_delete.py "${curr_dir}/train.en-zh.bpe.zh.voc" \
    "${curr_dir}/train.en-zh.bpe.en" "${curr_dir}/train.en-zh.bpe.zh" \
    "${curr_dir}/ins_del.train.${src}" "${curr_dir}/ins_del.train.${tgt}" \
    "${curr_dir}/ins_del.train.truth"

# Merge into the final train.${src} / train.${tgt} files. Note the argument
# order: the ${tgt} file is passed before the ${src} file.
python merge_file_with_sep.py \
    "${curr_dir}/ins_del.train.${tgt}" "${curr_dir}/ins_del.train.${src}" \
    "${curr_dir}/ins_del.train.truth" "$sep_token" \
    "${curr_dir}/train.${src}" "${curr_dir}/train.${tgt}"

# ---------------------------------------------------------------------------
# Validation data
# ---------------------------------------------------------------------------

# Preprocess and encode the dev set for both languages.
# NOTE(review): the zh dev file goes through the Moses normalize/tokenize
# pipeline here, whereas the zh training file is copied without it earlier
# in this script -- confirm that this train/valid mismatch is intended.
for l in "$src" "$tgt"; do
    # Truncate (">") instead of appending so that re-running the script does
    # not duplicate the dev data in valid.tok.${l}.
    perl "$NORM_PUNC" "$l" < "${data_dir}/opus.en-zh-dev.${l}" \
        | perl "$REM_NON_PRINT_CHAR" \
        | perl "$TOKENIZER" -threads 8 -a -l "$l" > "${curr_dir}/valid.tok.${l}"

    # Encode with the model referenced as "${bpe_codes}.model".
    sh "$TOOLS/sentencepiece-tool/encode_rawText2sp.sh" \
        "${curr_dir}/${bpe_codes}.model" \
        "${curr_dir}/valid.tok.${l}" "${curr_dir}/valid.tok.bpe.${l}"
done

# Same insert/delete step as for training; it is given the vocabulary file
# named after the training data (train.en-zh.bpe.zh.voc).
python random_insert_delete.py "${curr_dir}/train.en-zh.bpe.zh.voc" \
    "${curr_dir}/valid.tok.bpe.${src}" "${curr_dir}/valid.tok.bpe.${tgt}" \
    "${curr_dir}/ins_del.valid.${src}" "${curr_dir}/ins_del.valid.${tgt}" \
    "${curr_dir}/ins_del.valid.truth"

# Merge into valid.bpe.${src} / valid.bpe.${tgt}. Note the argument order:
# the ${tgt} file is passed before the ${src} file.
python merge_file_with_sep.py \
    "${curr_dir}/ins_del.valid.${tgt}" "${curr_dir}/ins_del.valid.${src}" \
    "${curr_dir}/ins_del.valid.truth" "$sep_token" \
    "${curr_dir}/valid.bpe.${src}" "${curr_dir}/valid.bpe.${tgt}"

# Run the project's preprocess.py on the merged files. The prefixes match the
# outputs of the merge steps: train.${src}/train.${tgt} and
# valid.bpe.${src}/valid.bpe.${tgt}. Results go to ./data-bin relative to the
# current directory. Expansions are quoted so that a working directory
# containing whitespace does not split the path arguments.
python "${CODES}/preprocess.py" \
    --source-lang "${src}" \
    --target-lang "${tgt}" \
    --task multilingual_translation \
    --trainpref "${curr_dir}/train" \
    --validpref "${curr_dir}/valid.bpe" \
    --destdir ./data-bin \
    --joined-dictionary \
    --workers 10
