# Positional arguments:
#   $1  path of the text file to train on and to encode
#   $2  vocabulary size handed to the trainer when $3 is 0
#   $3  mode switch: 0 runs a single train/encode pass, anything else
#       takes the VOLT branch at the bottom of this script
input_file="${1}"
vocab_size="${2}"
volt="${3}"

# Helper scripts this pipeline runs through python3.
spm_train_path='/home/song/scripts/spm_train.py'    # trains the BPE model in spm()
spm_encode_path='/home/song/scripts/spm_encode.py'  # encodes the input file in spm()
volt_path='/home/song/git/VOLT/ot_run.py'           # only used in the VOLT branch

# file parameters

#######################################
# (Re)derive every path that depends on the current vocab_size.
# Call again whenever vocab_size changes.
# Globals read:    input_file, vocab_size
# Globals written: input_folder, vocab_spm_prefix, vocab_spm_model,
#                  vocab_spm_vocab, spm_output_file
#######################################
update_vocab() {
    # dirname returns "." for a bare file name, whereas ${input_file%/*}
    # would leave the file name itself in place and yield paths such as
    # "corpus.txt/vocab.N.spm".
    input_folder=$(dirname -- "${input_file}")
    vocab_spm_prefix="${input_folder}/vocab.${vocab_size}.spm"
    vocab_spm_model="${vocab_spm_prefix}.model"
    vocab_spm_vocab="${vocab_spm_prefix}.vocab"
    spm_output_file="${input_file}.${vocab_size}.spm"
}

# Initial values for the vocab size given on the command line.
update_vocab

# VOLT outputs live next to the input file and do not depend on vocab_size.
vocab_volt_vocab="${input_folder}/vocab.volt"
vocab_volt_size="${input_folder}/vocab.volt.size"

#######################################
# Train a BPE model on input_file, normalise the resulting .vocab file and
# encode input_file into pieces.
# Globals read: spm_train_path, spm_encode_path, input_file, vocab_size,
#               vocab_spm_prefix, vocab_spm_model, vocab_spm_vocab,
#               spm_output_file
# Returns: non-zero as soon as one step fails; later steps are skipped so
#          they never run against a missing or stale model.
#######################################
spm() {
    python3 "${spm_train_path}" \
        --input="${input_file}" \
        --model_prefix="${vocab_spm_prefix}" \
        --vocab_size="${vocab_size}" \
        --character_coverage=1.0 \
        --model_type=bpe || return 1

    # Replace the tab separators in the .vocab file with single spaces.
    sed -i 's/\t/ /g' -- "${vocab_spm_vocab}" || return 1

    python3 "${spm_encode_path}" \
        --model "${vocab_spm_model}" \
        --inputs "${input_file}" \
        --outputs "${spm_output_file}" \
        --output_format piece
}

# volt == 0: one train/encode pass with the vocab size from the command line.
# Otherwise: train a 40000-piece candidate vocabulary, run VOLT on it, then
# retrain and re-encode with the size VOLT wrote to its size file.
# NOTE: a missing or non-numeric volt makes the test below fail, which also
# selects the VOLT branch.
if [ "${volt}" -eq 0 ]; then
    spm || exit
else
    # Pass 1: candidate vocabulary and its encoding, used as VOLT's input.
    vocab_size=40000
    update_vocab
    spm || exit

    python3 "${volt_path}" \
        --source_file "${spm_output_file}" \
        --token_candidate_file "${vocab_spm_vocab}" \
        --vocab_file "${vocab_volt_vocab}" \
        --max_number "${vocab_size}" \
        --interval 1000 \
        --loop_in_ot 500 \
        --tokenizer sentencepiece \
        --size_file "${vocab_volt_size}" || exit

    # Pass 2: retrain with the vocabulary size found in the size file.
    vocab_size=$(cat -- "${vocab_volt_size}") || exit
    if [ -z "${vocab_size}" ]; then
        printf 'error: %s is empty, cannot determine the VOLT vocab size\n' \
            "${vocab_volt_size}" >&2
        exit 1
    fi
    update_vocab
    spm || exit

    # Publish the final vocabulary and encoding under size-independent names.
    vocab_volt_final_vocab="${input_folder}/vocab.volt"
    spm_output_final_file="${input_file}.spm.volt"
    cp -- "${vocab_spm_vocab}" "${vocab_volt_final_vocab}" || exit
    # The original referenced the unset name spm_output_file_final here, so
    # cp received no destination and the .spm.volt file was never written.
    cp -- "${spm_output_file}" "${spm_output_final_file}"
fi
