#!/usr/bin/env bash

# bash mlgsum_proj_adapter.sh train_2stage en en mspm4 debug layer12 hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/xgiga/ZhEnFr_Unsup_infillNoise_r0.35_maxlen256_right_fix/checkpoints/mbartV2_cc100_adapterV2_encProjFix_trainLA_ls0 enzhfr_final_ln.json --encoder-version v1 --adapter-num-layer 2 --batch-size 8 --ln-after-proj;

# bash xgiga_proj_adapter.sh generate en zh mspm4 debug layer10 hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/xgiga/ZhEnFr_Unsup_infillNoise_r0.35_maxlen256_right_fix/checkpoints/mbartV2_cc100_adapterV2_encProjFix_trainLA_ls0 enzhfr_final_ln.json _best

# Positional arguments.
MODE="$1"
LG="$2"
EVAL_LG="$3" # 'all' or single language
DATAVER="$4"
NAME="$5"
component_name="$6" # "layer10" or "wiki40b_10w"
hdfs_adapter="$7"
ADAPTER_CONFIG_NAME="$8"

# Extra (pass-through) arguments normally start at $9. In "generate" mode one
# more positional argument sits in front of them and is skipped here (the
# usage example at the top of this file passes "_best" in that slot).
declare -i args_start=9
if [[ "$MODE" == "generate" ]]; then
    args_start=$((args_start + 1))
fi

#######################################
# Stores the given words in the global `argslist`, each one wrapped in a
# leading and a trailing space, e.g. "a" "b" -> " a  b ".
# The exact spacing matters: the output-file suffix is derived from it later.
# Globals:   argslist (written)
# Arguments: the words to collect (may be none -> empty string)
#######################################
collect_extra_args() {
    local word
    argslist=""
    for word in "$@"; do
        argslist="${argslist} ${word} "
    done
}

collect_extra_args "${@:args_start}"
# printf instead of an unquoted echo: no word splitting, no globbing, and a
# leading "-n"/"-e" argument cannot be mistaken for an echo option.
printf '%s\n' "$argslist" >&2

# Map the requested component name to its config file name; any name that is
# not listed here falls back to the base config.
case "$component_name" in
    layer10)
        component="layer10.json"
        ;;
    wiki40b_10w)
        component="wiki40b_10w.json"
        ;;
    *)
        component="baseConfig.json"
        ;;
esac

# Languages to evaluate: 'all' expands to en, zh and fr; any other value is
# split on whitespace into the array (read -a does not glob-expand the words,
# unlike an unquoted array assignment).
if [[ "$EVAL_LG" == "all" ]]; then
    predicted_lgs=(en zh fr)
else
    read -r -a predicted_lgs <<< "$EVAL_LG"
fi

# check: log the resolved configuration.
echo "component: " "$component"
# Space-prefixed concatenation of the languages, e.g. " en zh fr".
predicted_lg_str=""
for lg in "${predicted_lgs[@]}"; do
    predicted_lg_str="$predicted_lg_str $lg"
done
echo "predicted_lg_str: ${predicted_lg_str}"

#######################################
# Renames the four data files of one split,
#   <dir>/<old>.doc-sum.{doc,sum}.{bin,idx} -> <dir>/<new>.doc-sum.{doc,sum}.{bin,idx}
# All four moves are attempted even if an earlier one fails.
# Arguments:
#   $1 - directory containing the files
#   $2 - current split name (file-name prefix)
#   $3 - new split name
# Outputs:   one "rename ..." summary line on stdout (printed in every case)
# Returns:   0 if all four moves succeeded, 1 if any of them failed
#######################################
renameSplit(){
    local path=$1
    local oldsplit=$2
    local newsplit=$3
    local ext
    local status=0

    for ext in doc.bin doc.idx sum.bin sum.idx; do
        mv -- "${path}/${oldsplit}.doc-sum.${ext}" "${path}/${newsplit}.doc-sum.${ext}" || status=1
    done

    echo "rename ${path}/${oldsplit}.* to ${path}/${newsplit}.*"
    return "$status"
}

# Work from the directory that contains this script, so the relative paths
# used by the script (e.g. the fairseq checkout installed below) resolve.
if ! cd "$(dirname "$0")"; then
    # `return` is only valid when the script is sourced; when it is executed,
    # fall back to `exit` so we never continue in the wrong directory.
    return 1 2>/dev/null || exit 1
fi
echo "Install fairseq" >&2

# export https_proxy=http://bj-rd-proxy.byted.org:3128 http_proxy=http://bj-rd-proxy.byted.org:3128 no_proxy=code.byted.org

# sudo mkdir -p /usr/lib/python3.7/site-packages/
# Editable install of the local ./fairseq directory.
sudo pip3 install -e fairseq
# sudo pip3 install -r requirements.txt -i http://pypi.byted.org/simple/ --trusted-host=pypi.byted.org
# sudo pip3 install -r pyrouge_requirements.txt -i http://pypi.byted.org/simple/ --trusted-host=pypi.byted.org

# Point pyrouge at the ROUGE-1.5.5 release located in the current directory.
# Everything is quoted because the path is derived from the working directory
# and may contain spaces.
export PYROUGE_HOME_DIR="${PWD}/RELEASE-1.5.5"
export PYROUGE_TEMP_PATH=/opt/tiger

pyrouge_set_rouge_path "$PYROUGE_HOME_DIR"
# The ROUGE perl script must be executable.
chmod +x "$PYROUGE_HOME_DIR/ROUGE-1.5.5.pl"

# Remote (HDFS) locations, all below one user workspace prefix and
# parameterised by the data version, language and experiment name.
prefix=hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0
dataset_path="${prefix}/Datasets/multilingual/data-bin/${DATAVER}/${LG}"
tensorboard_logdir="${prefix}/Workspace/Multilingual/mlgsum/${LG}/logs/${NAME}"
checkpoint_path="${prefix}/Workspace/Multilingual/mlgsum/${LG}/checkpoints/${NAME}"
pretrained_path="${prefix}/Workspace/Multilingual/pretrained"

# Make sure the remote log and checkpoint directories exist. The paths are
# quoted so that a NAME/LG containing spaces or glob characters stays intact.
hdfs dfs -mkdir -p "$tensorboard_logdir"
hdfs dfs -mkdir -p "$checkpoint_path"

# Local working tree for this experiment, below the home directory.
local_root=~/mlgsum_${NAME}
resource_root=${local_root}/resource
output_path=${local_root}/output
model_path=${local_root}/model
adapter_path=${local_root}/adapter
# The component directory is shared: it does not depend on NAME.
component_path=~/component
local_dataset_path=${resource_root}/dataset

# Create the whole layout. The paths are quoted because NAME comes from the
# command line and may contain spaces.
mkdir -p -- \
    "$resource_root" \
    "$output_path" \
    "$model_path" \
    "$adapter_path" \
    "$component_path" \
    "$local_dataset_path"
# hadoop fs -copyToLocal ${dataset_path}/* ${local_dataset_path}
# echo "Download resource from ${dataset_path} to ${local_dataset_path}" >&2

# renameSplit ${local_dataset_path} "valid" "valid_$LG"
# renameSplit ${local_dataset_path} "test" "test_$LG"

# for lg in ${predicted_lgs[*]}
# do
#     if [ ! -e ${local_dataset_path}/valid_${lg}.doc-sum.doc.bin ]; then
#         echo "Download resource from ${prefix}/Datasets/multilingual/data-bin/${DATAVER}/$lg/valid.* to ${local_dataset_path}/" >&2
#         hdfs dfs -get ${prefix}/Datasets/multilingual/data-bin/${DATAVER}/$lg/valid.* ${local_dataset_path}/
#         renameSplit ${local_dataset_path} "valid" "valid_$lg"
#     fi
# done

#######################################
# Removes directory $1 if it exists and creates it again, empty.
# Arguments: $1 - directory path
# Returns:   status of the final mkdir
#######################################
recreate_dir() {
    local dir=$1
    # -d (directory), not -p (named pipe): with -p the stale directory was
    # never actually removed.
    if [[ -d "$dir" ]]; then
        rm -r -- "$dir"
    fi
    mkdir -p -- "$dir"
}

# Start from an empty local tensorboard directory.
local_tensorboard_path=${output_path}/tensorboard_logdir
recreate_dir "$local_tensorboard_path"

# The local checkpoint directory is kept as-is if it already exists.
local_checkpoint_path=${output_path}/checkpoint_path
mkdir -p -- "$local_checkpoint_path"

# local_pretrained_path=${model_path}/mbart.cc25.v2
# if [ ! -d ${local_pretrained_path} ]; then
#     echo "Load pretrained model from ${pretrained_path}/mbart.cc25.v2.tar.gz to ${local_pretrained_path}" >&2
#     hadoop fs -copyToLocal ${pretrained_path}/mbart.cc25.v2.tar.gz ${model_path}
#     tar -xvzf ${model_path}/mbart.cc25.v2.tar.gz -C ${model_path}
# else
#     echo "Pretrained model in ${local_pretrained_path}" >&2
# fi

# hdfs dfs -get hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/AdapterConfig/${ADAPTER_CONFIG_NAME} ${model_path}/
# hdfs dfs -get hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/ComponentConfig/$component ${model_path}/component_config.json
# hdfs dfs -get hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Datasets/cc100/components/* ${component_path}/

# # download adapter
# hdfs dfs -get ${hdfs_adapter}/*_best.pt ${adapter_path}/

# # use default vocabulary to load pretrained mBART
# echo "remove ${local_dataset_path}/dict.*.txt"
# rm ${local_dataset_path}/dict.*.txt
# echo "treat ${local_pretrained_path}/dict.txt as ${local_dataset_path}/dict.*.txt"
# cp ${local_pretrained_path}/dict.txt ${local_dataset_path}/dict.doc.txt
# cp ${local_pretrained_path}/dict.txt ${local_dataset_path}/dict.sum.txt

printf '%s\n' "Finish download files" >&2

# Comma-separated list of language codes, assembled from an array so that the
# individual codes are easy to read and diff.
lang_codes=(
    ar_AR cs_CZ de_DE en_XX es_XX
    et_EE fi_FI fr_XX gu_IN hi_IN
    it_IT ja_XX kk_KZ ko_KR lt_LT
    lv_LV my_MM ne_NP nl_XX ro_RO
    ru_RU si_LK tr_TR vi_VN zh_CN
)
langs=$(IFS=,; printf '%s' "${lang_codes[*]}")

# if [ ! -e ${local_dataset_path}/test_${EVAL_LG}.doc-sum.doc.bin ]; then
#     echo "Download resource from ${prefix}/Datasets/multilingual/data-bin/${DATAVER}/${EVAL_LG}/test.* to ${local_dataset_path}/" >&2
#     hdfs dfs -get ${prefix}/Datasets/multilingual/data-bin/${DATAVER}/${EVAL_LG}/test.* ${local_dataset_path}/
#     renameSplit ${local_dataset_path} "test" "test_${EVAL_LG}"
#     echo "Download resource from ${prefix}/Datasets/multilingual/data-bin/${DATAVER}/${EVAL_LG}/dict.* to ${local_dataset_path}/" >&2
#     hdfs dfs -get ${prefix}/Datasets/multilingual/data-bin/${DATAVER}/${EVAL_LG}/dict.* ${local_dataset_path}/
# fi

printf '%s\n' "Generating..."

# Derive the file-name suffix from the extra-argument string: drop every "-"
# and turn each run of spaces into a single "_".
suffix=$(sed -e 's/-//g' -e 's/  */_/g' <<< "$argslist")

# Fetch everything in the remote log directory into the local tensorboard
# directory. The glob is quoted so it reaches hdfs untouched instead of being
# subject to local pathname expansion.
hdfs dfs -get "${tensorboard_logdir}/*" "${local_tensorboard_path}/"

# Fetch the reference summaries. Both dataset versions (clean0702, then
# clean0712) are tried with the same local destination.
# NOTE(review): the message below names only clean0712 although clean0702 is
# requested first - presumably the second get is a fallback; confirm which
# version is meant to win when both exist.
echo "Load ground truth file from ${prefix}/Datasets/multilingual/clean0712/${EVAL_LG}/test.${EVAL_LG}.sum"
hadoop fs -get "${prefix}/Datasets/multilingual/clean0702/${EVAL_LG}/test.${EVAL_LG}.sum" "${local_dataset_path}"
hadoop fs -get "${prefix}/Datasets/multilingual/clean0712/${EVAL_LG}/test.${EVAL_LG}.sum" "${local_dataset_path}"

# Hypothesis file produced below and reference file it is scored against.
hypo="${local_tensorboard_path}/test${suffix}.hypo"
ref="${local_dataset_path}/test.${EVAL_LG}.sum"

#######################################
# Extracts the hypothesis texts from a generation output file.
# Keeps the lines starting with "H", version-sorts them (so H-2 comes before
# H-10), keeps the tab-separated fields from the third one onwards, then
# strips "[xx_XX]" tags and "[word] " tags (up to 10 lowercase letters,
# including the following space).
# Arguments: $1 - path of the generation output file
# Outputs:   one hypothesis per line on stdout
#######################################
extract_hypotheses() {
    local generation_output=$1
    grep -- '^H' "$generation_output" \
        | sort -V \
        | cut -f 3- \
        | sed -e 's/\[[a-z]\{2\}_[A-Z]\{2\}\]//g' \
              -e 's/\[[a-z]\{0,10\}\] //g'
}

extract_hypotheses "${local_tensorboard_path}/output${suffix}" > "$hypo"

# Score the hypotheses against the references with ./xnlg/calc_rouge.py.
# For Chinese, both files are first passed through ./xnlg/zh_split_words.py
# (split into chars) and the scorer is called with the extra "--zh True" flag.
rouge_extra_args=()
if [[ "$EVAL_LG" == "zh" ]]; then
    # split the reference and hypothesis into chars
    hypo_char="${local_tensorboard_path}/test${suffix}.hypo.char"
    ref_char="${local_dataset_path}/test.${EVAL_LG}.sum.char"
    python3 -u ./xnlg/zh_split_words.py < "$hypo" > "$hypo_char"
    python3 -u ./xnlg/zh_split_words.py < "$ref" > "$ref_char"
    hypo=$hypo_char
    ref=$ref_char
    rouge_extra_args=(--zh True)
fi
python3 ./xnlg/calc_rouge.py --ref "$ref" --hyp "$hypo" "${rouge_extra_args[@]}"