#!/usr/bin/env bash

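# Usage:
#   bash suffix_xgiga_dump_embedding_w_text.sh DATAVER TRAIN_LG TEST_LG_TAG NAME INPATH EMBDIR CKPT_SUFFIX SKIP_PREPROCESS HDFS_ADAPTER [extra args...]
# Arguments after the ninth are forwarded to fairseq/interactive_dump_embedding.py.
# HDFS_ADAPTER is only read when SKIP_PREPROCESS is not "true".
# Example (SKIP_PREPROCESS=true, so HDFS_ADAPTER is omitted):
# bash suffix_xgiga_dump_embedding_w_text.sh mspm4_xgiga en zh_CN finetune_pretrainedMspm4_freezeDecoder /opt/tiger/sumtest/tmpdata/$split.x.en_zh2en test_x_zh 1_4000 true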

# Download a file from docs.google.com (export=download, selected by file id)
# and save it as `xqg` in the current working directory. This runs before the
# script changes into its own directory further down, so the file lands
# wherever the caller invoked the script from. Certificate verification is
# disabled for this request.
# NOTE(review): nothing in the visible part of this script reads `xqg` —
# confirm whether this download is still needed.
wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1kZJ0YyvW2vjCJv2vqEoemww2s5ECK7XJ' -O xqg

# Positional arguments, bound in call order.
DATAVER=$1           # dataset version directory under .../data-bin/ on HDFS
TRAIN_LG=$2          # training-language path component (dataset and checkpoint paths)
TEST_LG_TAG=$3       # value handed to --prefix-tokens
NAME=$4              # run name: names the local work dir and the HDFS checkpoint dir
INPATH=$5            # raw text input; <INPATH>.spm and <INPATH>.spm.special are derived from it
EMBDIR=$6            # sub-directory of the local embedding dir that receives the results
ckpt_suffix=$7       # selects checkpoint_<ckpt_suffix>.pt
skip_preprocess=$8   # "true" skips installation, downloads and tokenization
hdfs_adapter=$9      # HDFS directory the *_best.pt adapter files are fetched from

# File name of the adapter configuration fetched from HDFS.
ADAPTER_CONFIG_NAME='enzhfr_final_ln.json'

# Collect every argument after the ninth into one space-separated string.
# The string is expanded unquoted further down so that each word becomes a
# separate option of the Python entry point, which is why it stays a plain
# string rather than an array.
argslist=""
for extra_arg in "${@:10}"; do
  argslist="${argslist} ${extra_arg} "
done
# Log the forwarded options on stderr. printf with a quoted expansion avoids
# the pathname expansion an unquoted `echo $argslist` would perform and keeps
# a leading "-n"/"-e" from being consumed as an echo option.
printf '%s\n' "${*:10}" >&2

#######################################
# Encode a raw text file with mBART's SentencePiece model.
# When /home/tiger/mbart.cc25 is missing, the mBART CC25 archive is fetched
# from HDFS and unpacked under /home/tiger first.
# Arguments:
#   $1 - path of the raw input text file
#   $2 - path the encoded output is written to
#   $3 - dictionary path (accepted for call compatibility; not used here)
# Outputs:
#   progress messages and the first encoded line on stdout
#######################################
tokenize(){
    local base_dir input output mbart_dir mbart_archive spm_model
    base_dir=$(pwd)
    input=$1
    output=$2

    echo "tokenize ${input} to ${output} using mbart's spm..."
    # setup MBART
    mbart_dir=/home/tiger/mbart.cc25
    mbart_archive=/home/tiger/mbart.CC25.tar.gz
    if [ ! -d "$mbart_dir" ]; then
        # hdfs_get skips the download when its target already exists, so the
        # target has to be the archive file itself; passing the directory
        # /home/tiger would turn the download into a no-op whenever that
        # directory exists.
        hdfs_get hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/mbart.CC25.tar.gz "$mbart_archive"
        tar -xvzf "$mbart_archive" -C /home/tiger
    fi
    spm_model=$mbart_dir/sentence.bpe.model
    # Paths are quoted so that inputs containing whitespace do not trigger an
    # "ambiguous redirect".
    python3 "${base_dir}/fairseq/scripts/spm_encode.py" --model="$spm_model" < "$input" > "$output"

    echo "first line of ${output}"
    head -n 1 "$output"
}

#######################################
# Create a directory (including parents) unless the path already exists.
# Arguments:
#   $1 - directory path to create
# Returns:
#   0 when the path already exists or was created, mkdir's status otherwise
#######################################
make_file(){
  local dir=$1
  # Quoted so that a path containing whitespace is tested and created as one
  # word; unquoted, `[` fails with "too many arguments" and nothing is created.
  if [ ! -e "$dir" ]; then
    mkdir -p -- "$dir"
  fi
}

#######################################
# Copy a path from HDFS to the local file system unless the local target
# already exists.
# Arguments:
#   $1 - HDFS source path
#   $2 - local target path
# Returns:
#   0 when the target already exists, otherwise the status of `hdfs dfs -get`
#######################################
hdfs_get(){
  local source=$1
  local target=$2
  # Quoted so that paths containing whitespace stay single arguments; unquoted,
  # the existence test errors out and the download is silently skipped.
  if [ ! -e "$target" ]; then
    hdfs dfs -get "$source" "$target"
  fi
}

# Run from the directory that contains this script so the relative paths used
# below (fairseq/, requirements.txt, examples/) resolve. `return` is only valid
# inside a function or a sourced script; when the file is executed it fails and
# the script would carry on in the wrong directory, so fall back to `exit`.
cd "$(dirname -- "$0")" || { return 1 2>/dev/null || exit 1; }

# Local working layout: one per-run root under the home directory, plus a
# shared component directory.
local_root=~/xgiga_dumpEmb_${NAME}
resource_root=${local_root}/resource
output_path=${local_root}/output
model_path=${local_root}/model
local_embedding_path=${local_root}/embedding/${EMBDIR}
local_checkpoint_path=${output_path}/checkpoint_path
adapter_path=${local_root}/adapter
component_path=~/component
local_dataset_path=${resource_root}/dataset

# Create the whole layout up front. Every path is passed quoted so that a NAME
# or EMBDIR value containing whitespace or glob characters stays one argument.
for required_dir in \
  "${local_checkpoint_path}" \
  "${resource_root}" \
  "${output_path}" \
  "${model_path}" \
  "${local_embedding_path}" \
  "${adapter_path}" \
  "${component_path}"; do
  make_file "${required_dir}"
done
mkdir -p -- "${local_dataset_path}"

# HDFS root that the dataset, pretrained-model and checkpoint paths hang off.
prefix=hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0

# Unless skip_preprocess is "true": install the Python dependencies, fetch the
# dataset, adapter and component resources from HDFS, replace the dataset
# dictionaries with the extended one, and tokenize the raw input text.
if [ "$skip_preprocess" != "true" ]; then
    echo "Install fairseq" >&2
    sudo mkdir -p /usr/lib/python3.7/site-packages/
    sudo pip3 install -e fairseq
    pip3 install -r requirements.txt -i http://pypi.byted.org/simple/ --trusted-host=pypi.byted.org

    # Assign first, export afterwards: `export VAR=$(cmd)` would hide a
    # failing command substitution behind export's own exit status.
    PYROUGE_HOME_DIR="$(pwd)/RELEASE-1.5.5"
    PYROUGE_TEMP_PATH=/opt/tiger
    export PYROUGE_HOME_DIR PYROUGE_TEMP_PATH

    pyrouge_set_rouge_path "$PYROUGE_HOME_DIR"
    chmod +x "$PYROUGE_HOME_DIR/ROUGE-1.5.5.pl"
    dataset_path=${prefix}/Datasets/multilingual/data-bin/${DATAVER}/${TRAIN_LG}
    pretrained_path=${prefix}/Workspace/Multilingual/pretrained

    # The HDFS wildcards below are quoted so the pattern reaches `hdfs`
    # untouched instead of first being offered to local pathname expansion.
    hdfs dfs -get "${dataset_path}/*" "${local_dataset_path}"
    echo "Download resource from ${dataset_path} to ${local_dataset_path}" >&2

    hdfs_get "hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/AdapterConfig/${ADAPTER_CONFIG_NAME}" "${model_path}/${ADAPTER_CONFIG_NAME}"
    hdfs dfs -get 'hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Datasets/cc100/components/*' "${component_path}/"
    hdfs_get hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/ComponentConfig/layer10.json "${model_path}/component_config.json"

    # download adapter
    hdfs dfs -get "${hdfs_adapter}/*_best.pt" "${adapter_path}/"

    # Replace whatever dictionaries came with the dataset by the extended
    # dictionary, used for both the doc and the sum side. The glob here is a
    # local one and therefore stays outside the quotes; -f keeps rm quiet when
    # no dictionary file is present.
    echo "remove ${local_dataset_path}/dict.*.txt"
    rm -f -- "${local_dataset_path}"/dict.*.txt
    hdfs dfs -get "${pretrained_path}/dict_extend.txt" "${local_dataset_path}"
    echo "treat ${local_dataset_path}/dict_extend.txt as ${local_dataset_path}/dict.*.txt"
    cp -- "${local_dataset_path}/dict_extend.txt" "${local_dataset_path}/dict.doc.txt"
    cp -- "${local_dataset_path}/dict_extend.txt" "${local_dataset_path}/dict.sum.txt"

    # Tokenize the raw text, then append " ." to every encoded line.
    tokenize "${INPATH}" "${INPATH}.spm" "${local_dataset_path}/dict.doc.txt"
    sed -e 's/$/ ./' < "${INPATH}.spm" > "${INPATH}.spm.special"
fi

# Fetch the requested checkpoint from HDFS; hdfs_get leaves an existing local
# copy alone. Arguments are quoted so that NAME or ckpt_suffix values with
# whitespace stay single words.
checkpoint_path=${prefix}/Workspace/Multilingual/xgiga/${TRAIN_LG}/checkpoints/${NAME}
hdfs_get "${checkpoint_path}/checkpoint_${ckpt_suffix}.pt" "${local_checkpoint_path}/checkpoint_${ckpt_suffix}.pt"
echo "Load checkpoints from ${checkpoint_path}/checkpoint_${ckpt_suffix}.pt to ${local_checkpoint_path}" >&2
echo "Finish download files" >&2

# Language tags handed to --langs.
langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN

echo "Start dumping embedding..."
# Extra arguments with dashes removed and runs of spaces turned into "_".
# NOTE(review): `suffix` is not read anywhere in the visible part of this
# script — confirm whether it is still needed.
suffix=$(echo "$argslist" | sed -e "s/-//g"  -e "s/  */_/g")

# Run the embedding dump on the tokenized input (<INPATH>.spm.special must
# already exist when preprocessing was skipped). Path and option values are
# quoted; $argslist alone is left unquoted on purpose so that it splits into
# the individual extra options collected from the command line.
# shellcheck disable=SC2086
python3 fairseq/interactive_dump_embedding.py "${local_dataset_path}" \
    --path "${local_checkpoint_path}/checkpoint_${ckpt_suffix}.pt" \
    --task summarization_from_pretrained_mbart_mspm4 \
    --gen-subset test \
    --source-lang doc --target-lang sum \
    --langs "${langs}" \
    --results-path "${local_embedding_path}" \
    --remove-bpe 'sentencepiece' \
    --truncate-source \
    --prefix-tokens "${TEST_LG_TAG}" \
    --user-dir examples/summarization \
    --batch-size 2 $argslist \
    --model-overrides "{'iadapter_config': '${model_path}/${ADAPTER_CONFIG_NAME}', 'trained_iadapter_dir': '${adapter_path}', 'component_config': '${model_path}/component_config.json'}" \
    --input "${INPATH}.spm.special"
 
# echo "Put ${local_embedding_path} to ${embedding_dir}" >&2
# hadoop fs -put -f ${local_embedding_path}/* ${embedding_dir}/
# sleep 600
