#!/usr/bin/env bash
#
# Launches `trainers.train_pairwise_head` in train + eval mode (--do_train,
# --do_eval, --evaluate_during_training) on the data under data/${DATA_NAME},
# using the "berson" wrapper model type with a multimodal ${MODEL_TYPE} model.
#
# Usage: run from the directory that contains the `trainers` package, `data/`
# and `pretrained_models/` (all paths below are relative).
#
# NOTE: --overwrite_output_dir is passed, so an existing output directory for
# this configuration is presumably reused/overwritten -- confirm against the
# trainer before re-running over results you want to keep.

# Names used to build the data directory, task name and output directory.
DATA_NAME="wikihow"
TASK_NAME="wikihow"
# Passed to --multimodal_model_type and embedded in the output directory name.
MODEL_TYPE="vilbert"
# Prefix of the --vision_model identifier; also embedded in the output dir name.
VISION_MODEL="detectron2"

# The output directory name hard-codes "fullimg", "reg3" and "seqlen60"; these
# mirror --include_full_img_features 1, --include_num_img_regional_features 3
# and --per_seq_max_length 60 below and must be kept in sync by hand.
#
# Every expansion is quoted so values are passed as single arguments, and the
# last argument deliberately has NO trailing backslash, so a line added after
# this command can never be absorbed into its argument list.
python3 -m trainers.train_pairwise_head \
  --model_name_or_path "pretrained_models/vilbert/roberta/large" \
  --do_not_load_optimizer \
  --do_train \
  --do_eval \
  --evaluate_during_training \
  --per_gpu_train_batch_size 4 \
  --per_gpu_eval_batch_size 1 \
  --learning_rate 2e-6 \
  --num_train_epochs 10.0 \
  --max_seq_length 300 \
  --data_dir "data/${DATA_NAME}" \
  --output_dir "exp_outputs/paper_results/finetune/${DATA_NAME}/${TASK_NAME}_berson_multimodal/${MODEL_TYPE}_${VISION_MODEL}_fullimg_reg3_seqlen60_berson" \
  --task_name "${TASK_NAME}_hl_v1" \
  --order_criteria "loose" \
  --overwrite_output_dir \
  --save_steps 5000 \
  --logging_steps 500 \
  --multimodal_model_type "${MODEL_TYPE}" \
  --per_seq_max_length 60 \
  --max_eval_steps 1000 \
  --iters_to_eval 30000 40000 \
  --warmup_steps 1000 \
  --wrapper_model_type "berson" \
  --eval_splits "test-human_annot_only" \
  --train_split "train-human_annot" \
  --multimodal \
  --vilbert_paired_coattention \
  --vision_model "${VISION_MODEL}_COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml" \
  --vision_feature_dim 1024 \
  --include_full_img_features 1 \
  --include_num_img_regional_features 3
