## Environment

The recommended way to run the code is inside Docker (the image below already has Apex installed):
```bash
# Start an interactive, throw-away container from the prebuilt image:
# NVIDIA runtime, host IPC namespace, privileged mode, bash as the entry command.
docker run \
    --interactive --tty \
    --rm \
    --runtime=nvidia \
    --ipc=host \
    --privileged \
    hangbo/pytorch:1.2.0-cuda10-apex \
    bash
```

Install the repo as a package:
```bash
# Clone this repository into ${code_dir} first, substituting the repository URL:
#   git clone <repository-url> "${code_dir}"

# Install the package in editable mode. The ":?" guard aborts when code_dir is
# unset or empty, and "&&" keeps pip from running in the wrong directory if cd fails.
cd "${code_dir:?set code_dir to the directory the repo was cloned into}" && pip install --editable .
```

## GLUE Finetuning

 The [General Language Understanding Evaluation (GLUE)](https://gluebenchmark.com/) benchmark is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems. 

 You can download the [GLUE data](https://gluebenchmark.com/tasks) by running [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) and unpacking it into a directory of your choice.

 Once the model checkpoint and the data are ready, use the following commands to start fine-tuning.

### MNLI

#### Finetuning

 ```shell
 # Fine-tune on MNLI with a single GPU, evaluating during training.
# Set the path to the training/dev dataset saved in the previous step
export DATASET_PATH=/path/to/read/glue/task/data/            # Example: "/path/to/downloaded-glue-data-dir/mnli/"

# Set path to save the finetuned model and result score
export OUTPUT_PATH=/path/to/save/result_of_finetuning

export TASK_NAME=mnli

# Set path to the model checkpoint you need to test
export CKPT_PATH=/path/to/your/model/checkpoint

# Set max sequence length
export MAX_LEN=128

# Set config file
export CONFIG_FILE=./unilm/configs/unilm3-base-cased.json

# Set paths to cache train & dev features (tokenized, only valid for this tokenizer!).
# Train and dev get separate files so one split's cached features cannot be
# picked up for the other; "%/" drops the trailing slash to avoid a "//" in the path.
export TRAIN_CACHE="${DATASET_PATH%/}/${TASK_NAME}.train.UNILM3_base_cased.${MAX_LEN}.cache"
export DEV_CACHE="${DATASET_PATH%/}/${TASK_NAME}.dev.UNILM3_base_cased.${MAX_LEN}.cache"

export PYTORCH_PRETRAINED_BERT_CACHE=/path/to/bert-cased-pretrained-cache/

# Hyperparameters for the run: batch size, learning rate, epochs,
# weight decay (WD) and warmup ratio (WM).
export BSZ=32
export LR=7e-6
export EPOCH=5
export WD=0.1
export WM=0.1

# Expansions are quoted so paths containing spaces survive word splitting.
CUDA_VISIBLE_DEVICES=0 python nlu_finetune/run_classifier.py \
    --model_type unilm --model_name_or_path "$CKPT_PATH" --task_name "$TASK_NAME" \
    --data_dir "$DATASET_PATH" --cached_train_file "$TRAIN_CACHE" --cached_dev_file "$DEV_CACHE" \
    --config_name "$CONFIG_FILE" --tokenizer_name unilm3-base-cased \
    --do_train --evaluate_during_training --logging_steps 1000 --output_dir "$OUTPUT_PATH" --max_grad_norm 0 \
    --max_seq_length "$MAX_LEN" --per_gpu_train_batch_size "$BSZ" --learning_rate "$LR" \
    --num_train_epochs "$EPOCH" --weight_decay "$WD" --warmup_ratio "$WM" \
    --fp16_init_loss_scale 128.0 --adam_epsilon 1e-6 --adam_betas "0.9,0.98" \
    --dropout_prob 0.05 --cls_dropout_prob 0.05 \
    --fp16 --fp16_opt_level O2 --seed 1234
 ```

### RTE

#### Finetuning

 ```shell
 # Fine-tune on RTE with a single GPU, evaluating during training.
# Set the path to the training/dev dataset saved in the previous step
export DATASET_PATH=/path/to/read/glue/task/data/            # Example: "/path/to/downloaded-glue-data-dir/rte/"

# Set path to save the finetuned model and result score
export OUTPUT_PATH=/path/to/save/result_of_finetuning

export TASK_NAME=rte

# Set path to the model checkpoint you need to test
export CKPT_PATH=/path/to/your/model/checkpoint

# Set config file
export CONFIG_FILE=./unilm/configs/unilm3-base-cased.json

# Set max sequence length
export MAX_LEN=256

# Set paths to cache train & dev features (tokenized, only valid for this tokenizer!).
# Train and dev get separate files so one split's cached features cannot be
# picked up for the other; "%/" drops the trailing slash to avoid a "//" in the path.
export TRAIN_CACHE="${DATASET_PATH%/}/${TASK_NAME}.train.UNILM3_base_cased.${MAX_LEN}.cache"
export DEV_CACHE="${DATASET_PATH%/}/${TASK_NAME}.dev.UNILM3_base_cased.${MAX_LEN}.cache"

export PYTORCH_PRETRAINED_BERT_CACHE=/path/to/bert-cased-pretrained-cache/

# Hyperparameters for the run: batch size, learning rate, epochs,
# weight decay (WD) and warmup ratio (WM).
export BSZ=8
export LR=1.5e-5
export EPOCH=15
export WD=0.1
export WM=0.2

# Expansions are quoted so paths containing spaces survive word splitting.
CUDA_VISIBLE_DEVICES=0 python nlu_finetune/run_classifier.py \
    --model_type unilm --model_name_or_path "$CKPT_PATH" --task_name "$TASK_NAME" \
    --data_dir "$DATASET_PATH" --cached_train_file "$TRAIN_CACHE" --cached_dev_file "$DEV_CACHE" \
    --config_name "$CONFIG_FILE" --tokenizer_name unilm3-base-cased \
    --do_train --evaluate_during_training --logging_steps 100 --output_dir "$OUTPUT_PATH" --max_grad_norm 0 \
    --max_seq_length "$MAX_LEN" --per_gpu_train_batch_size "$BSZ" --learning_rate "$LR" \
    --num_train_epochs "$EPOCH" --weight_decay "$WD" --warmup_ratio "$WM" \
    --fp16_init_loss_scale 128.0 --adam_epsilon 1e-6 --adam_betas "0.9,0.98" \
    --dropout_prob 0.05 --cls_dropout_prob 0.05 \
    --fp16 --fp16_opt_level O2 --seed 1234
 ```

#### some results:

```
83.394 79.422 81.227 81.227 80.144 80.144 79.061 84.838
```

### STS-B

#### Finetuning

 ```shell
 # Fine-tune on STS-B with a single GPU, evaluating during training.
# Set the path to the training/dev dataset saved in the previous step
export DATASET_PATH=/path/to/read/glue/task/data/            # Example: "/path/to/downloaded-glue-data-dir/STS-B/"

# Set path to save the finetuned model and result score
export OUTPUT_PATH=/path/to/save/result_of_finetuning

# This section is for STS-B; the task name must not be left as "rte".
# NOTE(review): "sts-b" follows the usual GLUE task naming — confirm against
# the task names accepted by nlu_finetune/run_classifier.py.
export TASK_NAME=sts-b

# Set path to the model checkpoint you need to test
export CKPT_PATH=/path/to/your/model/checkpoint

# Set config file
export CONFIG_FILE=./unilm/configs/unilm3-base-cased.json

# Set max sequence length
export MAX_LEN=256

# Set paths to cache train & dev features (tokenized, only valid for this tokenizer!).
# Train and dev get separate files so one split's cached features cannot be
# picked up for the other; "%/" drops the trailing slash to avoid a "//" in the path.
export TRAIN_CACHE="${DATASET_PATH%/}/${TASK_NAME}.train.UNILM3_base_cased.${MAX_LEN}.cache"
export DEV_CACHE="${DATASET_PATH%/}/${TASK_NAME}.dev.UNILM3_base_cased.${MAX_LEN}.cache"

export PYTORCH_PRETRAINED_BERT_CACHE=/path/to/bert-cased-pretrained-cache/

# Hyperparameters for the run: batch size, learning rate, epochs,
# weight decay (WD) and warmup ratio (WM).
export BSZ=8
export LR=2e-5
export EPOCH=15
export WD=0.1
export WM=0.2

# Expansions are quoted so paths containing spaces survive word splitting.
CUDA_VISIBLE_DEVICES=0 python nlu_finetune/run_classifier.py \
    --model_type unilm --model_name_or_path "$CKPT_PATH" --task_name "$TASK_NAME" \
    --data_dir "$DATASET_PATH" --cached_train_file "$TRAIN_CACHE" --cached_dev_file "$DEV_CACHE" \
    --config_name "$CONFIG_FILE" --tokenizer_name unilm3-base-cased \
    --do_train --evaluate_during_training --logging_steps 100 --output_dir "$OUTPUT_PATH" --max_grad_norm 0 \
    --max_seq_length "$MAX_LEN" --per_gpu_train_batch_size "$BSZ" --learning_rate "$LR" \
    --num_train_epochs "$EPOCH" --weight_decay "$WD" --warmup_ratio "$WM" \
    --fp16_init_loss_scale 128.0 --adam_epsilon 1e-6 --adam_betas "0.9,0.98" \
    --dropout_prob 0.05 --cls_dropout_prob 0.05 \
    --fp16 --fp16_opt_level O2 --seed 1234
 ```

#### some results:

```
90.880 90.991 90.948 91.143 91.013
```

## SQuAD 2.0 Fine-tuning 
[Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable. 

 ```shell
# Download SQuAD 2.0 and fine-tune on it, first on one GPU, then on two.
# Set the directory that will hold the SQuAD 2.0 train/dev files
export DATASET_PATH=/path/to/read/squad2/task/data/

# Download the train & dev dataset
mkdir -p "$DATASET_PATH"
# Train dataset ("%/" drops the trailing slash to avoid a "//" in the path)
export TRAIN_FILE="${DATASET_PATH%/}/train-v2.0.json"
# Capital -O names the downloaded file; lowercase -o would write wget's log there instead.
wget -O "$TRAIN_FILE" https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
# Dev dataset
export DEV_FILE="${DATASET_PATH%/}/dev-v2.0.json"
wget -O "$DEV_FILE" https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

# Set path to save the finetuned model and result score
export OUTPUT_PATH=/path/to/save/result_of_finetuning

# Set path to the model checkpoint you need to test
export CKPT_PATH=/path/to/your/model/checkpoint

# Set config file
export CONFIG_FILE=./unilm/configs/unilm3-base-cased.json

# Set path to cache train & dev features (tokenized, only use for this tokenizer!)
export TRAIN_CACHE="${TRAIN_FILE}_UNILM3_base_cased.384doc.cache"
export DEV_CACHE="${DEV_FILE}_UNILM3_base_cased.384doc.cache"

export PYTORCH_PRETRAINED_BERT_CACHE=/path/to/bert-cased-pretrained-cache/

# Setting the hyperparameters for the run.
export BSZ=32
export LR=3e-5
export EPOCH=3

# Single-GPU run. --config_name is passed once, via $CONFIG_FILE; the second
# occurrence referenced an undefined $CONFIG_PATH and has been dropped.
# NOTE(review): --pro is kept exactly as in the original command — confirm
# which run_squad.py option it is meant to select.
CUDA_VISIBLE_DEVICES=0 python nlu_finetune/run_squad.py \
    --model_type unilm --model_name_or_path "$CKPT_PATH" \
    --config_name "$CONFIG_FILE" --tokenizer_name unilm3-base-cased \
    --train_file "$TRAIN_FILE" --predict_file "$DEV_FILE" \
    --cached_train_file "$TRAIN_CACHE" --cached_dev_file "$DEV_CACHE" \
    --do_train --do_eval --pro \
    --per_gpu_train_batch_size "$BSZ" --learning_rate "$LR" --num_train_epochs "$EPOCH" --gradient_accumulation_steps 1 \
    --max_seq_length 384 --doc_stride 128 --output_dir "$OUTPUT_PATH" \
    --version_2_with_negative --seed 1234 --max_grad_norm 0 \
    --weight_decay 0.1 --warmup_ratio 0.2 \
    --fp16_init_loss_scale 128.0 --adam_epsilon 1e-6 --adam_betas "0.9,0.98" \
    --fp16_opt_level O2 --fp16

# The model can also be run in a distributed manner as follows (2 GPUs):

# per_gpu_train_batch_size = train_batch_size / num_gpus = 32 / 2 = 16
export BSZ=16
export LR=3e-5
export EPOCH=3

# --nproc_per_node is an option of the torch.distributed.launch helper, so the
# script has to be started through it (one process per GPU).
# NOTE(review): assumes run_squad.py accepts the --local_rank argument that the
# launcher passes to each process — confirm.
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 nlu_finetune/run_squad.py \
    --model_type unilm --model_name_or_path "$CKPT_PATH" \
    --config_name "$CONFIG_FILE" --tokenizer_name unilm3-base-cased \
    --train_file "$TRAIN_FILE" --predict_file "$DEV_FILE" \
    --cached_train_file "$TRAIN_CACHE" --cached_dev_file "$DEV_CACHE" \
    --do_train --do_eval --pro \
    --per_gpu_train_batch_size "$BSZ" --learning_rate "$LR" --num_train_epochs "$EPOCH" --gradient_accumulation_steps 1 \
    --max_seq_length 384 --doc_stride 128 --output_dir "$OUTPUT_PATH" \
    --version_2_with_negative --seed 1234 --max_grad_norm 0 \
    --weight_decay 0.01 --warmup_ratio 0.1 \
    --fp16_init_loss_scale 128.0 --adam_epsilon 1e-6 --adam_betas "0.9,0.98" \
    --fp16_opt_level O2 --fp16
 ```
