#!/usr/bin/env bash

# bash dump_text_emb_mbart.sh mspm4_xgiga en_XX /mnt/bd/lab-wxz/clt/cc100/en.first1w.txt cc100_first1w_en false "{'arch': 'mbart_summ_abs_large', 'doc_state': 'encoder', 'encoder_version': 'v2'}"

# Positional arguments:
#   $1  DATAVER          dataset version; selects the data-bin directory fetched from HDFS
#   $2  TEST_LG_TAG      language tag of the input text (e.g. en_XX)
#   $3  INPATH           raw text file to embed
#   $4  EMBDIR           sub-directory of the local embedding root that receives the results
#   $5  skip_preprocess  "true" skips the install/download step
#   $6  model_overrides  value forwarded to --model-overrides
#   $7+                  extra options appended to the dump command line
DATAVER="$1"
TEST_LG_TAG="$2"
INPATH="$3"
EMBDIR="$4"
skip_preprocess="$5"
model_overrides="$6"

NAME="encoder"

# Join every argument after the sixth into one string, each word padded with
# a space on both sides. The string is expanded unquoted later on, so every
# word becomes a separate command-line option.
argslist=""
for extra_arg in "${@:7}"; do
  argslist="${argslist} ${extra_arg} "
done
# Log the collected options. Quoted so they are printed verbatim instead of
# being glob-expanded against the current directory and re-split.
printf '%s\n' "$argslist" >&2

# setup MBART
MBART=/mnt/bd/lab-wxz/mbart.cc25.v2
# MBART=/home/tiger/mbart.cc25.v2

#######################################
# Encode a raw text file with mBART's sentencepiece model.
# Globals:
#   MBART - directory holding sentence.bpe.model (read). When it does not
#           exist, the archive is fetched into /home/tiger and MBART is
#           repointed at /home/tiger/mbart.cc25.v2 if that directory exists.
# Arguments:
#   $1 - input text file
#   $2 - output file for the encoded text
#   $3 - dictionary path; accepted for existing callers, not used
# Outputs:
#   Progress messages and the first encoded line on stdout.
# Returns:
#   Non-zero if the download, extraction or encoding step fails.
#######################################
tokenize(){
    local base_dir
    base_dir=$(pwd)

    local INPUT=$1
    local OUTPUT=$2
    local fallback_root=/home/tiger
    local fallback_dir=${fallback_root}/mbart.cc25.v2

    echo "tokenize ${INPUT} to ${OUTPUT} using mbart's spm..."
    if [[ ! -d "$MBART" ]]; then
        # Reuse a copy unpacked by an earlier run instead of downloading again
        # (copyToLocal would fail on the already-present archive).
        if [[ ! -d "$fallback_dir" ]]; then
            hadoop fs -copyToLocal hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/mbart.cc25.v2.tar.gz "$fallback_root" || return
            tar -xvzf "${fallback_root}/mbart.cc25.v2.tar.gz" -C "$fallback_root" || return
        fi
        # The archive is unpacked under /home/tiger, not under the configured
        # MBART path, so point MBART at the local copy; otherwise the model
        # lookup below would still use the missing directory.
        # NOTE(review): assumes the archive unpacks to mbart.cc25.v2/ - confirm.
        if [[ -d "$fallback_dir" ]]; then
            MBART=$fallback_dir
        fi
    fi
    local MODEL=$MBART/sentence.bpe.model
    python3 "${base_dir}/fairseq/scripts/spm_encode.py" --model="$MODEL" < "$INPUT" > "$OUTPUT" || return

    echo "first line of ${OUTPUT}"
    head -n 1 "$OUTPUT"
}

#######################################
# Create a directory (with any missing parents) unless the path already exists.
# Arguments:
#   $1 - directory path
# Returns:
#   0 if the path already exists or was created; non-zero if the path is
#   empty or mkdir fails.
#######################################
make_file(){
  local dir=$1
  if [[ -z "$dir" ]]; then
    echo "make_file: missing directory path" >&2
    return 1
  fi
  # Quoted so paths with whitespace are tested and created as a single path.
  if [[ ! -e "$dir" ]]; then
    mkdir -p -- "$dir"
  fi
}

#######################################
# Fetch a path from HDFS unless the local target already exists.
# Arguments:
#   $1 - HDFS source path
#   $2 - local target path
# Returns:
#   0 if the target already exists; otherwise the status of `hdfs dfs -get`.
#######################################
hdfs_get(){
  local src=$1
  local target=$2
  # Quoted so each path is passed to hdfs as exactly one argument.
  if [[ ! -e "$target" ]]; then
    hdfs dfs -get "$src" "$target"
  fi
}

# Run from the script's own directory so the relative paths used below
# (fairseq/, install_pyrouge.sh, examples/summarization) resolve.
# `exit`, not `return`: in an executed script a top-level `return` only prints
# an error and the script would carry on in the wrong directory.
cd "$(dirname -- "$0")" || exit 1

# Local working layout, rooted under the home directory.
local_root=~/xgiga_dumpEmb_${NAME}
resource_root=${local_root}/resource
output_path=${local_root}/output
# model_path=${local_root}/model
local_embedding_path=${local_root}/embedding/${EMBDIR}
local_checkpoint_path=${output_path}/checkpoint_path
component_path=~/component

# Paths are quoted so one containing whitespace or glob characters (EMBDIR is
# caller-supplied) is handed over as a single argument.
make_file "${local_checkpoint_path}"
make_file "${resource_root}"
make_file "${output_path}"
# make_file "${model_path}"
make_file "${local_embedding_path}"
make_file "${component_path}"

local_dataset_path=${resource_root}/dataset
mkdir -p -- "${local_dataset_path}"

# HDFS location that holds the datasets and pretrained resources.
prefix=hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0

# Setup step: install fairseq and pyrouge, then download the dataset files and
# the extended dictionary. Skipped when skip_preprocess is exactly "true".
if [[ "$skip_preprocess" != "true" ]]; then
    echo "Install fairseq" >&2
    sudo mkdir -p /usr/lib/python3.7/site-packages/
    sudo pip3 install -e fairseq
    bash install_pyrouge.sh

    # Assign first, export second, so a failing $(pwd) is not masked by export.
    PYROUGE_HOME_DIR=$(pwd)/RELEASE-1.5.5
    export PYROUGE_HOME_DIR
    export PYROUGE_TEMP_PATH=/opt/tiger

    pyrouge_set_rouge_path "$PYROUGE_HOME_DIR"
    chmod +x "$PYROUGE_HOME_DIR/ROUGE-1.5.5.pl"
    dataset_path=${prefix}/Datasets/multilingual/data-bin/${DATAVER}/en
    pretrained_path=${prefix}/Workspace/Multilingual/pretrained

    # The wildcard is quoted so it reaches hdfs untouched instead of being
    # offered to the local shell for pathname expansion.
    hdfs dfs -get "${dataset_path}/*" "${local_dataset_path}"
    echo "Download resource from ${dataset_path} to ${local_dataset_path}" >&2

    # Replace the downloaded dictionaries with the extended one, used for both
    # the doc and sum sides. -f keeps rm quiet when no dictionary was fetched.
    echo "remove ${local_dataset_path}/dict.*.txt"
    rm -f -- "${local_dataset_path}"/dict.*.txt
    hdfs dfs -get "${pretrained_path}/dict_extend.txt" "${local_dataset_path}"
    echo "treat ${local_dataset_path}/dict_extend.txt as ${local_dataset_path}/dict.*.txt"
    cp -- "${local_dataset_path}/dict_extend.txt" "${local_dataset_path}/dict.doc.txt"
    cp -- "${local_dataset_path}/dict_extend.txt" "${local_dataset_path}/dict.sum.txt"
fi

# Tokenize the raw text into $INPATH.spm, then rewrite every encoded line as
# "[<TEST_LG_TAG>] <tokens> ." into $INPATH.spm.special.
tokenize "$INPATH" "$INPATH.spm" "${local_dataset_path}/dict.doc.txt"
# One sed pass applies both edits. TEST_LG_TAG is spliced into the replacement
# text, so it must not contain "/", "&" or "\".
sed -e 's/$/ ./' -e "s/^/[${TEST_LG_TAG}] /" < "$INPATH.spm" > "$INPATH.spm.special"
echo "Finish download files" >&2

# Comma-separated language tags passed to --langs.
langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN

echo "Start dumping embedding..."
# Tag derived from the extra options: dashes removed, runs of spaces turned
# into "_". NOTE(review): nothing visible in this script reads `suffix`.
suffix=$(printf '%s\n' "$argslist" | sed -e "s/-//g"  -e "s/  */_/g")

# Paths and tags are quoted so each reaches python3 as one argument.
# $argslist stays unquoted on purpose: it has to split into separate options.
# shellcheck disable=SC2086
python3 fairseq/interactive_dump_embedding.py "${local_dataset_path}" \
    --path "${MBART}/model.pt" \
    --task summarization_from_pretrained_mbart_mspm4 \
    --gen-subset test \
    --source-lang doc --target-lang sum \
    --langs "$langs" \
    --results-path "${local_embedding_path}" \
    --remove-bpe 'sentencepiece' \
    --truncate-source \
    --doc-lang "${TEST_LG_TAG}" \
    --prefix-tokens "${TEST_LG_TAG}" \
    --user-dir examples/summarization \
    --max-sentences 8 $argslist \
    --model-overrides "${model_overrides}" \
    --input "$INPATH.spm.special" \
    --skip-invalid-size-inputs-valid-test
