#!/usr/bin/env bash

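# Usage:
#   bash wikilingua_dump_embedding.sh DATAVER TRAIN_LG NAME TEST_LG [extra args for dump_embedding.py ...]
# Example:
#   bash wikilingua_dump_embedding.sh mspm4 english ext_finetune chinese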

# Positional parameters. Further down, DATAVER and TEST_LG select the dataset
# directory, TRAIN_LG and NAME select the checkpoint directory, and
# TEST_LG/NAME/TRAIN_LG together name the embedding output directory.
DATAVER="${1}"
TRAIN_LG="${2}"
NAME="${3}"
TEST_LG="${4}"

# Every parameter from the fifth one onward is collected into a single flat
# string, each value wrapped in one space on either side.
argslist=""
for extra in "${@:5}"; do
  argslist+=" ${extra} "
done
# Left unquoted on purpose so echo receives the word-split value.
echo $argslist >&2

# Run from the directory that contains this script so that the relative paths
# used later (fairseq, requirements.txt, examples/summarization) resolve.
# `exit` is used rather than `return`: this file is executed, not sourced, and
# a top-level `return` only prints an error and lets the script carry on in
# the wrong directory.
cd -- "$(dirname -- "$0")" || {
  echo "Cannot change to script directory: $(dirname -- "$0")" >&2
  exit 1
}

# Environment setup: fairseq is installed in editable mode from the local
# `fairseq` directory, then the packages listed in requirements.txt are
# installed from the index given with -i.
printf '%s\n' "Install fairseq" >&2

sudo mkdir -p /usr/lib/python3.7/site-packages/
sudo pip3 install -e fairseq
pip3 install \
  -r requirements.txt \
  -i http://pypi.byted.org/simple/ \
  --trusted-host=pypi.byted.org

# Remote (HDFS) locations; all of them live under one user directory.
prefix="hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0"
wikilingua_workspace="${prefix}/Workspace/Multilingual/wikilingua"

# Dataset directory for the requested data version and test language.
dataset_path="${prefix}/Datasets/multilingual/wikilingua/data-bin/${DATAVER}/${TEST_LG}"
# Checkpoint directory of run NAME under the training language.
checkpoint_path="${wikilingua_workspace}/${TRAIN_LG}/checkpoints/${NAME}"
# Output directory for the embeddings, filed under the test language.
embedding_dir="${wikilingua_workspace}/${TEST_LG}/embeddings/${NAME}_${TRAIN_LG}"

# Create the HDFS output directory for the embeddings up front (-p: no error
# if it already exists). The path is quoted so that a NAME or language value
# containing whitespace or glob characters cannot split it into several
# arguments.
hdfs dfs -mkdir -p "${embedding_dir}"

# Local scratch layout under the home directory, one tree per (NAME, TEST_LG).
# The tilde stays outside the quotes so it is still expanded; everything else
# is quoted so whitespace or glob characters in the values cannot split paths.
local_root=~/"wikiLingua_dumpEmb_${NAME}_${TEST_LG}"
resource_root="${local_root}/resource"
output_path="${local_root}/output"
model_path="${local_root}/model"
# Not created here; it is passed to dump_embedding.py as --results-path.
local_embedding_path="${local_root}/embedding"
mkdir -p "${resource_root}" "${output_path}" "${model_path}"

# Stage the dataset locally. The HDFS glob is quoted so that it reaches
# `hadoop fs` verbatim and is expanded against HDFS, never by the local shell.
local_dataset_path="${resource_root}/dataset"
mkdir -p "${local_dataset_path}"
hadoop fs -copyToLocal "${dataset_path}/*" "${local_dataset_path}"
echo "Download resource from ${dataset_path} to ${local_dataset_path}" >&2

# Stage the best checkpoint locally.
local_checkpoint_path="${output_path}/checkpoint_path"
mkdir -p "${local_checkpoint_path}"
hadoop fs -copyToLocal "${checkpoint_path}/checkpoint_best.pt" "${local_checkpoint_path}"
echo "Load checkpoints from ${checkpoint_path}/checkpoint_best.pt to ${local_checkpoint_path}" >&2

# Verify what is on disk rather than the exit status of the copies: a re-run
# over an already populated scratch directory is still accepted, while a
# genuinely missing input stops the script before the dump step.
if [[ -z "$(find "${local_dataset_path}" -mindepth 1 -print -quit)" ]]; then
  echo "No dataset files found in ${local_dataset_path}" >&2
  exit 1
fi
if [[ ! -f "${local_checkpoint_path}/checkpoint_best.pt" ]]; then
  echo "Missing checkpoint ${local_checkpoint_path}/checkpoint_best.pt" >&2
  exit 1
fi

echo "Finish download files" >&2

# Comma-separated language codes passed to --langs.
langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN

# Progress messages go to stderr, like every other message in this script.
echo "Start dumping embedding..." >&2

# Run the dump on the local copies of the dataset and checkpoint.
# $argslist is intentionally unquoted: it is a flat string of extra flags that
# must be word-split into separate arguments.
# shellcheck disable=SC2086
python3 fairseq/dump_embedding.py "${local_dataset_path}" \
    --path "${local_checkpoint_path}/checkpoint_best.pt" \
    --task summarization_from_pretrained_mbart_mspm4 \
    --gen-subset test \
    --source-lang doc --target-lang sum \
    --langs "${langs}" \
    --results-path "${local_embedding_path}" \
    --remove-bpe 'sentencepiece' \
    --truncate-source \
    --user-dir examples/summarization \
    --batch-size 1 \
    $argslist
dump_status=$?

# Stop here on failure so the script does not continue past a failed dump.
if (( dump_status != 0 )); then
  echo "dump_embedding.py failed with exit status ${dump_status}" >&2
  exit "${dump_status}"
fi

# Upload everything in the local embedding directory to HDFS, overwriting
# existing files (-f). Only the variable parts are quoted; the trailing /*
# stays outside the quotes so the local shell still expands it.
echo "Put ${local_embedding_path} to ${embedding_dir}" >&2
hadoop fs -put -f "${local_embedding_path}"/* "${embedding_dir}/"
# sleep 600
