#!/usr/bin/env bash

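# Usage:
#   bash xgiga_generate_zero_w_adapter_proj.sh DATAVER TRAIN_LG TEST_LG NAME EXTEND_DICT HDFS_ADAPTER ADAPTER_CONFIG_NAME CKPT_SUFFIX [extra fairseq/generate.py args...]
#
# Example:
#   bash xgiga_generate_zero_w_adapter_proj.sh mspm4_xgiga en zh mbartV2_iadapterfix_2stage_adapterV2_encProjFix_rouge_en_rightCkpt_valid8 None hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/xgiga/ZhEnFr_infillNoise_r0.35_len256_right/checkpoints/mbartV2_cc100_adapterV2_encProjFix_trainLA_ls0 enzhfr_final_ln.json 2_24000 --prefix-tokens zh_CN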


# Positional arguments. Each one is only ever interpolated into a path or
# compared as a string further down in this script.
DATAVER=$1              # dataset version directory under Datasets/multilingual/data-bin
TRAIN_LG=$2             # training language; selects the checkpoint directory and names the log directory
TEST_LG=$3              # test language; selects the dataset sub-directory and the reference file
NAME=$4                 # experiment name; names the checkpoint, log and local work directories
EXTEND_DICT=$5          # basename (without .txt) of an extra dictionary, or the literal "None"
hdfs_adapter=$6         # HDFS directory from which the *_best.pt adapter files are fetched
ADAPTER_CONFIG_NAME=$7  # adapter config file name under Workspace/Multilingual/AdapterConfig
ckpt_suffix=$8          # checkpoint to evaluate: checkpoint_${ckpt_suffix}.pt

# Everything after the eighth positional argument is collected into one flat
# string, each argument wrapped in single spaces. The string is later
# word-split onto the generate command line and also used to derive the
# output file suffix.
argslist=""
for extra_arg in "${@:9}"; do
  argslist+=" ${extra_arg} "
done
echo $argslist >&2

# Work from the directory containing this script so the relative paths used
# below (fairseq/, requirements.txt, RELEASE-1.5.5/, xnlg/, ...) resolve.
# `return` is only valid inside a function or a sourced file; in an executed
# script it merely prints an error and execution would continue in the wrong
# directory, so abort explicitly instead.
cd -- "$(dirname -- "$0")" || exit 1

# Install the bundled fairseq checkout in editable mode, then the remaining
# Python requirements from the pypi.byted.org index.
printf '%s\n' "Install fairseq" >&2

# NOTE(review): presumably the editable install needs this site-packages
# directory to exist beforehand — confirm.
sudo mkdir -p /usr/lib/python3.7/site-packages/
sudo pip3 install --editable fairseq
pip3 install \
  --requirement requirements.txt \
  --index-url http://pypi.byted.org/simple/ \
  --trusted-host pypi.byted.org

# sudo apt-get update
# sudo apt-get install libxml-perl libxml-dom-perl

# Point pyrouge at the ROUGE-1.5.5 release that sits next to this script and
# make its Perl entry point executable.
# Assignment and export are split so the command substitution's status is not
# masked by `export`, and every use is quoted because the value is derived
# from the current directory, which may contain whitespace.
PYROUGE_HOME_DIR="$(pwd)/RELEASE-1.5.5"
export PYROUGE_HOME_DIR
export PYROUGE_TEMP_PATH=/opt/tiger

pyrouge_set_rouge_path "${PYROUGE_HOME_DIR}"
chmod +x "${PYROUGE_HOME_DIR}/ROUGE-1.5.5.pl"

# Remote (HDFS) locations, all rooted at ${prefix}.
prefix=hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0
hdfs_workspace="${prefix}/Workspace/Multilingual"

# Dataset for the test language that is copied to the local machine.
dataset_path="${prefix}/Datasets/multilingual/data-bin/${DATAVER}/${TEST_LG}"
# Destination of the generation outputs uploaded at the end of the run.
tensorboard_logdir="${hdfs_workspace}/xgiga/${TEST_LG}/logs/${NAME}_${TRAIN_LG}"
# Directory holding checkpoint_${ckpt_suffix}.pt of the model to evaluate.
new_checkpoint_path="${hdfs_workspace}/xgiga/${TRAIN_LG}/checkpoints/${NAME}"
# Directory holding the shared dictionaries (dict_extend.txt, ${EXTEND_DICT}.txt).
pretrained_path="${hdfs_workspace}/pretrained"

echo "new_checkpoint_path " $new_checkpoint_path

# Make sure the HDFS directory that receives the outputs of this run exists.
hdfs dfs -mkdir -p "${tensorboard_logdir}"

# Local working tree for this (experiment, test language) pair.
local_root=~/xgiga_${NAME}_${TEST_LG}
resource_root=${local_root}/resource   # downloaded dataset
output_path=${local_root}/output       # generation outputs and checkpoint
model_path=${local_root}/model         # adapter / component config files
adapter_path=${local_root}/adapter     # downloaded adapter weights
component_path=~/component             # cc100 components, outside the per-run tree

# Quoted so a home directory or experiment name containing whitespace does not
# split into several directory names.
mkdir -p -- \
  "${resource_root}" \
  "${output_path}" \
  "${model_path}" \
  "${adapter_path}" \
  "${component_path}"

# Copy the dataset for the test language from HDFS into the local resource
# directory.
local_dataset_path=${resource_root}/dataset
mkdir -p -- "${local_dataset_path}"
# The glob is quoted so the local shell always passes it through untouched;
# the hadoop FS shell resolves it against HDFS.
hadoop fs -copyToLocal "${dataset_path}/*" "${local_dataset_path}"
echo "Download resource from ${dataset_path} to ${local_dataset_path}" >&2

# Local directory whose contents are uploaded to ${tensorboard_logdir} at the
# end of the run (raw generate output and extracted hypotheses).
local_tensorboard_path=${output_path}/tensorboard_logdir
# hadoop fs -copyToLocal ${tensorboard_logdir} ${local_tensorboard_path}
mkdir -p -- "${local_tensorboard_path}"

# Local directory for the checkpoint under evaluation.
local_checkpoint_path=${output_path}/checkpoint_path
mkdir -p -- "${local_checkpoint_path}"

# Paths are quoted so an experiment name with whitespace stays one argument.
hadoop fs -copyToLocal "${new_checkpoint_path}/checkpoint_${ckpt_suffix}.pt" "${local_checkpoint_path}"
echo "Load checkpoints from ${new_checkpoint_path}/checkpoint_${ckpt_suffix}.pt to ${local_checkpoint_path}" >&2

# Fetch the adapter config, the cc100 components and the base component
# config. The HDFS root is taken from ${prefix} (defined above with the same
# value that used to be repeated here) so the location lives in one place.
# Globs are quoted so they reach hdfs unexpanded by the local shell.
hdfs dfs -get "${prefix}/Workspace/Multilingual/AdapterConfig/${ADAPTER_CONFIG_NAME}" "${model_path}/"
hdfs dfs -get "${prefix}/Datasets/cc100/components/*" "${component_path}/"
# The base config is stored locally under the name the generate step expects.
hdfs dfs -get "${prefix}/Workspace/Multilingual/ComponentConfig/baseConfig.json" "${model_path}/component_config.json"

# download adapter
hdfs dfs -get "${hdfs_adapter}/*_best.pt" "${adapter_path}/"

# Replace the dictionaries that came with the dataset by the shared
# dict_extend.txt, used for both the source (doc) and target (sum) side.
echo "remove ${local_dataset_path}/dict.*.txt"
# :? aborts instead of globbing /dict.*.txt when the variable is empty;
# -f keeps rm from prompting on write-protected files in a batch job.
rm -f -- "${local_dataset_path:?}"/dict.*.txt
hdfs dfs -get "${pretrained_path}/dict_extend.txt" "${local_dataset_path}"
echo "treat ${local_dataset_path}/dict_extend.txt as ${local_dataset_path}/dict.*.txt"
cp -- "${local_dataset_path}/dict_extend.txt" "${local_dataset_path}/dict.doc.txt"
cp -- "${local_dataset_path}/dict_extend.txt" "${local_dataset_path}/dict.sum.txt"

# Optionally append an extra dictionary to both dict files. Passing the
# literal "None" as EXTEND_DICT skips this step.
if [[ "${EXTEND_DICT}" != "None" ]]; then
    extend_dict_file="${local_dataset_path}/${EXTEND_DICT}.txt"
    echo "Load extend dictionary from ${pretrained_path}/${EXTEND_DICT}.txt to ${local_dataset_path}" >&2
    hdfs dfs -get "${pretrained_path}/${EXTEND_DICT}.txt" "${local_dataset_path}"
    # Verify the file BEFORE appending it. The check tests for the local file
    # rather than the hdfs exit status so that a re-run, where the file is
    # already present and -get refuses to overwrite it, still succeeds.
    # Generating with un-extended dictionaries cannot give valid results, so
    # stop here instead of continuing.
    if [[ ! -e "${extend_dict_file}" ]]; then
        echo "[ERROR] Load extend dictionary ${EXTEND_DICT}.txt failed!" >&2
        exit 1
    fi
    echo "write the extended dictionary into ${local_dataset_path}/dict.*.txt"
    cat -- "${extend_dict_file}" >> "${local_dataset_path}/dict.doc.txt"
    cat -- "${extend_dict_file}" >> "${local_dataset_path}/dict.sum.txt"
fi

printf '%s\n' "Finish download files" >&2

# Language tags handed to the generate step through --langs.
langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN

printf '%s\n' "Generating..."

# File-name suffix derived from the extra arguments: every '-' is dropped and
# each run of spaces becomes a single '_', e.g.
# " --prefix-tokens  zh_CN " -> "_prefixtokens_zh_CN_".
suffix=$(sed -e 's/-//g' -e 's/  */_/g' <<<"$argslist")

# Run fairseq generation on the test split and capture its stdout.
# The fixed arguments live in an array so every path stays a single word even
# if it contains whitespace; the caller-supplied extras follow last, exactly
# as before.
generate_args=(
  "${local_dataset_path}"
  --path "${local_checkpoint_path}/checkpoint_${ckpt_suffix}.pt"
  --task summarization_from_pretrained_mbart_mspm4
  --gen-subset test
  --source-lang doc --target-lang sum
  --langs "${langs}"
  --remove-bpe 'sentencepiece'
  --max-len-a 0.78 --max-len-b 2 --min-len 2
  --lenpen 0.6
  --no-repeat-ngram-size 3
  --truncate-source
  --user-dir examples/summarization
  --batch-size 8
  # Adapter/component locations are injected as model overrides.
  --model-overrides "{'iadapter_config': '${model_path}/${ADAPTER_CONFIG_NAME}', 'trained_iadapter_dir': '${adapter_path}', 'component_config': '${model_path}/component_config.json'}"
)

# argslist is a flat string that must be word-split back into separate
# arguments, hence deliberately unquoted.
# shellcheck disable=SC2086
python3 fairseq/generate.py "${generate_args[@]}" ${argslist} \
  > "${local_tensorboard_path}/output${suffix}"

# Reads generate output on stdin and writes the cleaned hypotheses to stdout:
#   1. keep the lines starting with "H",
#   2. version-sort them so H-2 comes before H-10,
#   3. keep tab-separated field 3 onwards,
#   4. strip "[xx_YY]" tags and "[lowercase] " markers (0-10 letters followed
#      by a space).
# \{0,10\} is the portable spelling of the GNU-only \{,10\}, and plain grep is
# enough for an anchored literal (no PCRE needed).
_extract_hypotheses() {
  grep '^H' \
    | sort -V \
    | cut -f 3- \
    | sed -e 's/\[[a-z]\{2\}_[A-Z]\{2\}\]//g' -e 's/\[[a-z]\{0,10\}\] //g'
}

_extract_hypotheses \
  < "${local_tensorboard_path}/output${suffix}" \
  > "${local_tensorboard_path}/test${suffix}.hypo"

# Fetch the reference summaries for the test language next to the dataset.
echo "Load ground truth file from ${prefix}/Datasets/multilingual/xgiga/raw/test.y.${TEST_LG}"
# Quoted so the paths are always passed as exactly two arguments.
hadoop fs -get "${prefix}/Datasets/multilingual/xgiga/raw/test.y.${TEST_LG}" "${local_dataset_path}/"

# Hypothesis / reference pair handed to the ROUGE step below.
hypo="${local_tensorboard_path}/test${suffix}.hypo"
ref="${local_dataset_path}/test.y.${TEST_LG}"

# Score the hypotheses against the references with ROUGE.
# [[ ]] with a quoted operand avoids the "unary operator expected" error the
# old unquoted [ ] test raised when TEST_LG was empty.
if [[ "${TEST_LG}" == "zh" ]]; then
  # split the reference and hypothesis into chars
  hypo_char="${local_tensorboard_path}/test${suffix}.hypo.char"
  ref_char="${local_dataset_path}/test.y.${TEST_LG}.char"
  python3 -u ./xnlg/zh_split_words.py < "${hypo}" > "${hypo_char}"
  python3 -u ./xnlg/zh_split_words.py < "${ref}" > "${ref_char}"
  hypo=${hypo_char}
  ref=${ref_char}
  python3 ./xnlg/calc_rouge.py --ref "${ref}" --hyp "${hypo}" --zh True
else
  python3 ./xnlg/calc_rouge.py --ref "${ref}" --hyp "${hypo}"
fi

# Upload everything produced locally to the HDFS log directory, overwriting
# files from earlier runs (-f).
echo "Put ${local_tensorboard_path} to ${tensorboard_logdir}" >&2
# The variable is quoted; the trailing /* stays unquoted on purpose because it
# must be expanded by the local shell.
hadoop fs -put -f "${local_tensorboard_path}"/* "${tensorboard_logdir}/"
# NOTE(review): the reason for this fixed two-minute pause is not visible in
# this script — presumably it gives the job platform time to collect logs;
# confirm before changing it.
sleep 120
