# --- Tool locations ---------------------------------------------------------
# Everything hangs off the workspace root. The three Perl script paths below
# point into the mosesdecoder checkout; they are defined here but not
# referenced by the steps visible in this script.
WORKSPACE="/workspace"
TOOLS="${WORKSPACE}/tools"
SCRIPTS="${TOOLS}/mosesdecoder-master/scripts"
TOKENIZER="${SCRIPTS}/tokenizer/tokenizer.perl"
NORM_PUNC="${SCRIPTS}/tokenizer/normalize-punctuation.perl"
REM_NON_PRINT_CHAR="${SCRIPTS}/tokenizer/remove-non-printing-char.perl"

# Load the workspace-wide shell settings (contents not visible from here).
# Kept at this exact position: anything assigned above is visible to it, and
# anything assigned below takes effect after it.
source "${WORKSPACE}/bashrc"
BPEROOT="${TOOLS}/subword-nmt"
CODES="${WORKSPACE}/codes/git-repo/fairseq-master"
# NOTE: the trailing slash means derived paths contain a double slash ("//").
ORI_DATA="/workspace/datas/opus-100/opus-100-corpus/v1.0/zero-shot/multi-way-synthetic/"

# --- Language pair and corpus directories -----------------------------------
src="en"
tgt="zh"
data_dir="${ORI_DATA}/ar-zh"
test_dir="${ORI_DATA}/test"

# --- Working directory, subword model and separator token -------------------
curr_dir="."
bpe_codes="../${src}-${tgt}.codes.model"
sep_token="<sep>"

# Encode the two raw-text sides with the model in ${bpe_codes}, then merge
# them into the single source-side file consumed by the preprocessing step.
# The helper scripts are not visible here; from the call sites the encoder
# presumably takes (model, input, output) -- verify against
# encode_rawText2sp.sh.
#
# Each step consumes the previous step's output, so abort on the first
# failure instead of continuing with missing or stale files.
sh "${TOOLS}/sentencepiece-tool/encode_rawText2sp.sh" "${bpe_codes}" \
    "${data_dir}/syn_multi_way_src_len_zh-ar.aren_0.4" "${curr_dir}/aren.sp" \
    || { echo "error: encoding syn_multi_way_src_len_zh-ar.aren_0.4 failed" >&2; exit 1; }

sh "${TOOLS}/sentencepiece-tool/encode_rawText2sp.sh" "${bpe_codes}" \
    "${data_dir}/syn_multi_way_src_len_zh-ar.zh_0.4" "${curr_dir}/zh.sp" \
    || { echo "error: encoding syn_multi_way_src_len_zh-ar.zh_0.4 failed" >&2; exit 1; }

# merge_file_with_sep.py is resolved relative to the current directory.
# Arguments: the two encoded files, the separator token, and the output path
# (presumably joined line-by-line around the separator -- verify in the script).
python merge_file_with_sep.py "${curr_dir}/aren.sp" "${curr_dir}/zh.sp" \
    "${sep_token}" "${curr_dir}/src.bpe.${src}" \
    || { echo "error: merging aren.sp and zh.sp failed" >&2; exit 1; }

# Binarise the merged source-side file with fairseq's preprocess.py.
# --trainpref is a prefix: with --source-lang ${src} it resolves to
# ${curr_dir}/src.bpe.${src}. The dictionary is taken from ../data-bin and
# only the source side is processed (--only-source).
bin_dir="${curr_dir}/data-bin"
mkdir -p "${bin_dir}"

python "${CODES}/preprocess.py" \
    --source-lang "${src}" \
    --target-lang "${tgt}" \
    --task multilingual_translation \
    --only-source \
    --srcdict "../data-bin/dict.${src}.txt" \
    --trainpref "${curr_dir}/src.bpe" \
    --destdir "${bin_dir}" \
    --workers 10
