#!/usr/bin/env bash
# export NCCL_SOCKET_IFNAME=eth0
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# ---------------------------------------------------------------------------
# Run configuration. Every value below is consumed by the train.py invocation
# further down in this script; the comment on each group names the flag(s).
# ---------------------------------------------------------------------------

# Model architecture (--arch) and training criterion (--criterion).
ARCH='ngram_transformer_prophet_large'
# ARCH=ngram_transformer_prophet_base
CRITERION='ngram_language_loss_L2RPLM'

# Paths. DATADIR is the positional data argument; USER_DIR is both the
# --user-dir value and the directory that holds train.py.
# DATADIR=processed
DATADIR='wikipedia_bookcorpus/processed'
USER_DIR='P2DeNet_pretrain'
SAVE_DIR='pretrain_checkpoint_jdnet_large'               # --save-dir
TENSORBOARD_LOGDIR='pretrain_tensorboard_jdnet_large'    # --tensorboard-logdir

# Batch shape: --tokens-per-sample, --max-sentences, --update-freq.
TOKENS_PER_SAMPLE='512'
MAX_SENTENCES='8'
UPDATE_FREQ='2'

# Learning-rate schedule: --lr and --warmup-updates.
PEAK_LR='0.0001'
WARMUP_UPDATES='10000'

# Stopping limits: --max-update and --max-epoch.
TOTAL_UPDATES='1500000'
MAX_EPOCHES='192'

# Launch pretraining with ${USER_DIR}/train.py on GPUs 0-3 of this host.
#
# The arguments are collected in a Bash array so that every expansion is
# quoted and no line-continuation backslash can accidentally swallow the
# statement that follows the command. The exit status of train.py is checked:
# on failure a message is written to stderr and the script exits with the
# same status instead of reporting "pretrain done".
echo 'pretrain starting ...'

train_args=(
  # Positional argument: data directory.
  "${DATADIR}"

  # Model, task and objective.
  --user-dir "${USER_DIR}" --arch "${ARCH}" --fp16
  --task masked_s2s --criterion "${CRITERION}"
  --sample-break-mode none

  # Batch shape.
  --tokens-per-sample "${TOKENS_PER_SAMPLE}"
  --max-sentences "${MAX_SENTENCES}"
  --update-freq "${UPDATE_FREQ}"

  # Optimizer and learning-rate schedule.
  --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --weight-decay 0.01
  --clip-norm 0.1
  --lr-scheduler inverse_sqrt --lr "${PEAK_LR}" --warmup-updates "${WARMUP_UPDATES}"

  # Stopping limits and checkpoint cadence.
  --max-update "${TOTAL_UPDATES}"
  --max-epoch "${MAX_EPOCHES}"
  --save-interval-updates 100000 --keep-interval-updates 10

  # Regularization.
  --dropout 0.1 --attention-dropout 0.1 --activation-dropout 0.1

  # Runtime, output locations and reproducibility.
  --ddp-backend=no_c10d
  --num-workers 4
  --save-dir "${SAVE_DIR}"
  --skip-invalid-size-inputs-valid-test --seed 1
  --tensorboard-logdir "${TENSORBOARD_LOGDIR}"
  --reset-optimizer

  # Distributed setup.
  # NOTE(review): world size 64 with only 4 visible devices suggests a
  # multi-node job in which this script starts rank 0; other nodes would
  # presumably need a different --distributed-rank. Confirm before reuse.
  --distributed-backend nccl --distributed-world-size 64 --distributed-rank 0
  --distributed-init-method 'tcp://10.207.176.244:4322' --distributed-port 4322
)

# Capture the exit status explicitly so a failed run is never reported as done.
train_status=0
CUDA_VISIBLE_DEVICES=0,1,2,3 python "${USER_DIR}/train.py" "${train_args[@]}" \
  || train_status=$?

if (( train_status != 0 )); then
  printf 'pretrain failed (exit status %d)\n' "${train_status}" >&2
  exit "${train_status}"
fi

echo 'pretrain done'

# --use-bmuf