# Preprocess the raw POS splits (data/POS/{train,valid,test}.{src,tgt}) into
# data/POS/processed, using ./vocab.txt as both the source and the target
# dictionary.
# Abort with the tool's own exit status on failure: the training step later in
# this script reads data/POS/processed, so continuing after a failed
# preprocess would train on missing or stale data.
fairseq-preprocess \
  --user-dir ./prophetnet \
  --task translation_prophetnet \
  --source-lang src --target-lang tgt \
  --trainpref data/POS/train \
  --validpref data/POS/valid \
  --testpref data/POS/test \
  --destdir data/POS/processed \
  --srcdict ./vocab.txt --tgtdict ./vocab.txt \
  --workers 40 || exit

# ---------------------------------------------------------------------------
# Configuration for the fairseq-train invocation that follows.
# ---------------------------------------------------------------------------
ulimit -n 655350
DATA_DIR=data/POS/processed
USER_DIR=./prophetnet
ARCH=ngram_transformer_prophet_large
CRITERION=ngram_language_loss
# Checkpoint directory. An empty value would leave `--save-dir` without an
# argument, so honour SAVE_DIR from the environment and otherwise fall back to
# "checkpoints" (fairseq-train's documented default for --save-dir).
SAVE_DIR=${SAVE_DIR:-checkpoints}

PRETRAINED_MODEL=prophetnet_large_pretrained_160G_14epoch_model.pt

# Number of training examples = number of lines in the raw source file.
# Fail early with a clear message instead of a later arithmetic syntax error.
if [ ! -r data/POS/train.src ]; then
  echo 'data/POS/train.src is missing or unreadable' >&2
  exit 1
fi
NUM_DATA=$(wc -l < data/POS/train.src)

# One line of nvidia-smi output per GPU. GPU_NUM is used as a divisor below,
# so refuse to continue when no GPU is reported (or nvidia-smi is absent).
GPU_NUM=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
if [ "${GPU_NUM:-0}" -lt 1 ]; then
  echo 'No GPU reported by nvidia-smi; cannot compute UPDATE_FREQ' >&2
  exit 1
fi

# Report the settings only after they have been assigned.
echo "$SAVE_DIR"
echo "GPU $GPU_NUM"
echo "$DATA_DIR, $NUM_DATA"

MAX_SENTENCE=1
# NOTE: the variable name is misspelled and says "per batch", but the value is
# NUM_DATA / (32*16), which the echo label below reports as updates per
# epoch. The name is kept unchanged in case other code refers to it.
UPDAETS_PER_BATCH=$((NUM_DATA / 32 / 16))
# Checkpoint interval in updates: a quarter of the value above.
EVAL_STEPS=$((UPDAETS_PER_BATCH / 4))
# Chosen so that MAX_SENTENCE * GPU_NUM * UPDATE_FREQ is about 32*16 = 512
# (integer division truncates).
UPDATE_FREQ=$((32 * 16 / MAX_SENTENCE / GPU_NUM))
echo "Max Sentence $MAX_SENTENCE"
echo "NUM_DATA, $NUM_DATA"
echo "EVAL_STEPS, $EVAL_STEPS"
echo "UPDAETS_PER_EPOCH, $UPDAETS_PER_BATCH"

# Must be exported: a plain assignment on its own line stays local to this
# shell and is never seen by the Python process started by fairseq-train.
export PYTHONIOENCODING=utf8
# Fine-tune the pretrained ProphetNet checkpoint on the preprocessed data.
# Relies on variables assigned earlier in this script: DATA_DIR, USER_DIR,
# ARCH, CRITERION, UPDATE_FREQ, MAX_SENTENCE, PRETRAINED_MODEL, EVAL_STEPS and
# SAVE_DIR.
#
# Notes:
#  * Every line except the last must end with a backslash. Without it the
#    remaining flags are executed as a separate command and never reach
#    fairseq-train (this previously happened to --fp16).
#  * Expansions are quoted so an empty or space-containing value cannot shift
#    the following arguments.
#  * An empty SAVE_DIR falls back to "checkpoints", fairseq-train's documented
#    default for --save-dir, instead of leaving the flag without a value.
fairseq-train "$DATA_DIR" \
  --user-dir "$USER_DIR" --task translation_prophetnet --arch "$ARCH" \
  --optimizer adam --adam-betas '(0.9, 0.999)' --clip-norm 0.1 \
  --lr 0.0001 \
  --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 1000 \
  --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
  --criterion "$CRITERION" --label-smoothing 0.1 \
  --update-freq "$UPDATE_FREQ" --max-sentences "$MAX_SENTENCE" \
  --num-workers 4 \
  --load-from-pretrained-model "$PRETRAINED_MODEL" \
  --load-sep \
  --ddp-backend=no_c10d --max-epoch 10 \
  --max-source-positions 512 --max-target-positions 512 \
  --truncate-source \
  --skip-invalid-size-inputs-valid-test \
  --save-interval-updates "$EVAL_STEPS" \
  --keep-interval-updates 20 \
  --seed 666 \
  --save-dir "${SAVE_DIR:-checkpoints}" \
  --keep-last-epochs 0 \
  --fp16