#!/usr/bin/env bash
# Map the positional arguments of this launcher onto named settings.
#
# Usage:
#   <script> SUFFIX ENC_LR_SCALING DEC_LR_SCALING SWITCH_MODE_NUM_UPDATE \
#            KL_STRATEGY KL_LOSS_COEF LAMBDA_AT [STRATEGY] [THRESHOLD]
#
#   $1  SUFFIX                  run name; also used as model_signature
#                               (default: empty)
#   $2  enc_lr_scaling
#   $3  dec_lr_scaling
#   $4  switch_mode_num_update
#   $5  kl_strategy
#   $6  kl_loss_coef
#   $7  lambda_at
#   $8  strategy                (default: at_wrong)
#   $9  threshold
#
# Arguments without a listed default are left empty when omitted; no
# validation is performed here.
root=./
#export PYTHONPATH=$root:$PYTHONPATH

# Positional parameters cannot be assigned with `$N=value` (that word is run
# as a command, not treated as an assignment), so defaults are applied through
# ${N:-default} while copying each argument into its named variable.
SUFFIX=${1:-}
model_signature=${SUFFIX}
enc_lr_scaling=${2:-}
dec_lr_scaling=${3:-}
switch_mode_num_update=${4:-}
kl_strategy=${5:-}
kl_loss_coef=${6:-}
lambda_at=${7:-}
strategy=${8:-at_wrong}
threshold=${9:-}
# Launch stage2_my_train.py on the wmt14_en_de data directory under ${root}.
#
# Values taken from the variables set earlier in this script:
#   model_signature        -> sub-directory of checkpoints_v7/wmt14_en_de
#                             passed to --save-dir
#   switch_mode_num_update -> --switch_mode_num_update
#   enc_lr_scaling         -> --enc_lr_scaling
#   dec_lr_scaling         -> --dec_lr_scaling
#   strategy, threshold    -> --strategy, --threshold
#   kl_strategy            -> --kl_strategy
#   kl_loss_coef           -> --kl_loss_coef
#   lambda_at              -> --lambda_at
# All remaining options are fixed in this script.
#
# Every expansion is double-quoted so that an empty value is still passed as
# its own (empty) argument instead of disappearing and letting the following
# --flag be consumed as the option's value, and so that values are never
# word-split or glob-expanded by the shell.

# Make CUDA devices 0-3 the ones visible to the training process.
export CUDA_VISIBLE_DEVICES=0,1,2,3
python "${root}/fairseq_cli/stage2_my_train.py" \
    "${root}/data-bin/wmt14_en_de" \
    --task my_translation \
    --mode joint3-at-only \
    --default_task_mode joint3-at-only \
    --default_model_mode at \
    --source-lang en --target-lang de \
    --arch my_at_nat_transformer --src_tgt_share_emb \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --save-dir "${root}/checkpoints_v7/wmt14_en_de/${model_signature}" \
    --ddp-backend=no_c10d \
    --max-update 300000 --save-interval-updates 5000 \
    --switch_mode_num_update "${switch_mode_num_update}" \
    --keep-interval-updates 40 \
    --lr 7e-4 \
    --enc_lr_scaling "${enc_lr_scaling}" \
    --dec_lr_scaling "${dec_lr_scaling}" \
    --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --weight-decay 0.0 \
    --dropout 0.1 \
    --criterion my_criterion --label-smoothing 0.1 \
    --encoder-normalize-before --at-decoder-normalize-before --nat-decoder-normalize-before \
    --max-tokens 8192 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --encoder-layers 6 \
    --at-decoder-layers 6 \
    --nat-decoder-layers 6 \
    --apply-bert-init \
    --noise random_mask \
    --strategy "${strategy}" \
    --threshold "${threshold}" \
    --kl_strategy "${kl_strategy}" \
    --kl_loss_coef "${kl_loss_coef}" \
    --lambda_at "${lambda_at}"
