#!/usr/bin/env bash
# Train a transformer_iwslt_de_en model on the binarized IWSLT14 De-En data
# with train.py, then decode with the best-BLEU checkpoint via
# fairseq-generate.
#
# Only POSIX sh features are used, so running this with `sh` keeps working.

# Abort on the first failing command or unset variable, so generation never
# runs against a missing or stale checkpoint after a failed training run.
set -eu

# Values that appear in more than one place are defined once here so the
# training flags, the generation flags and the checkpoint directory name
# cannot drift apart.
readonly gpu_id=3
readonly data_dir=data-bin/iwslt14.tokenized.de-en
readonly max_source_positions=96
readonly n_centroid=3
readonly seed=42
readonly save_dir="checkpoints/iwslt_de_en_cluster_ml-${max_source_positions}_nc-${n_centroid}_innerlossw-lyle-lyle_seed-${seed}"

# --- Training ---------------------------------------------------------------
# The best checkpoint is selected by validation BLEU (maximized); per-epoch
# checkpoints are not kept.
# NOTE(review): --n_centroid, --cluster_loss_weight and --sort_loss_weight are
# presumably custom options defined by this project's train.py — confirm.
CUDA_VISIBLE_DEVICES="$gpu_id" python train.py \
    "$data_dir" \
    --max-epoch 80 \
    --arch transformer_iwslt_de_en --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-tokens 4096 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-remove-bpe \
    --n_centroid "$n_centroid" \
    --truncate-source \
    --max-source-positions "$max_source_positions" \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --cluster_loss_weight 1 --sort_loss_weight 1 \
    --save-dir "$save_dir" \
    --no-epoch-checkpoints --seed "$seed"

# --- Generation -------------------------------------------------------------
# Decode with the best checkpoint saved by the training step above, using the
# same source truncation length as training.
# NOTE(review): unlike training, this step is not pinned to a GPU via
# CUDA_VISIBLE_DEVICES — confirm that is intended.
fairseq-generate "$data_dir" \
    --path "$save_dir/checkpoint_best.pt" \
    --batch-size 128 --beam 5 --remove-bpe --quiet \
    --truncate-source --max-source-positions "$max_source_positions"
