#!/usr/bin/env bash
# Configuration for the component-dump / encoding / projection / retrieval
# steps that follow. Requires bash >= 4 (associative arrays).

# Root directory of the project. Fill it in here, or export `rootdir` before
# running the script.
rootdir="${rootdir:-}" # to be filled

# Refuse to continue with an empty root: every path derived from it would
# otherwise resolve against the filesystem root (/crossLingual, /component,
# /graphs).
if [[ -z "${rootdir}" ]]; then
    echo "rootdir is empty: set it at the top of this script or export it" >&2
    exit 1
fi

drive_dir=/mnt/docker-volume-byte-drive/volumes/5981492_lab-wxz

cc100_emb_path=${drive_dir}/embedding/xgiga_dumpEmb_encoder
working_dir=${rootdir}/crossLingual
component_dir=${rootdir}/component
graph_dir=${rootdir}/graphs

# `mkdir -p` succeeds silently when the directory already exists, so no
# separate existence test is needed.
if ! mkdir -p -- "${component_dir}" "${graph_dir}"; then
    echo "Cannot create ${component_dir} and/or ${graph_dir}" >&2
    exit 1
fi

# Two-letter language code -> tag handed to dump_text_emb_mbart.sh.
declare -A LANG_TAGS
LANG_TAGS=([en]="en_XX" [fr]="fr_XX" [zh]="zh_CN")

# All later commands use paths relative to the working directory, so a
# failed cd must stop the script.
if ! cd -- "${working_dir}"; then
    echo "Cannot enter working directory ${working_dir}" >&2
    exit 1
fi

un_dir=${drive_dir}/clt/UN/dev/

# dump component
# Runs normalizeEmb.py in extractComponentMode once per language, reading the
# cc100 document embeddings and writing ${component_dir}/<lang>_component.npy.
#
# The script is invoked from inside utils/. If that directory is missing we
# must stop: otherwise the `cd ..` below would climb out of the working
# directory and every later step would run from the wrong place.
if ! cd utils/; then
    echo "Cannot enter utils/ from ${PWD}" >&2
    exit 1
fi
for lg in en zh fr; do
    python3 normalizeEmb.py \
        -i "${cc100_emb_path}/embedding/cc100_first1w_${lg}/document_embedding.jsonl" \
        -o "${component_dir}/${lg}_component.npy" \
        -m extractComponentMode -r 100
done
# Return to the working directory for the remaining steps.
cd .. || exit 1
echo "Dumping components is finished"


# Encoding UN dataset
# Calls dump_text_emb_mbart.sh once per language with that language's tag from
# LANG_TAGS and the matching UNv1.0.6way.<lang>.first_5000.txt file.
for lg in en zh fr; do
    TAG=${LANG_TAGS[$lg]}
    # Arguments are quoted so an empty tag or a path containing whitespace
    # cannot shift the positional parameters of the called script.
    # ${un_dir%/} drops the trailing slash to avoid a "//" in the input path.
    bash dump_text_emb_mbart.sh mspm4_xgiga "${TAG}" \
        "${un_dir%/}/UNv1.0.6way.${lg}.first_5000.txt" "UN_${lg}" false \
        "{'arch': 'mbart_summ_abs_large', 'doc_state': 'encoder', 'encoder_version': 'v2'}"
done
echo "Encoding UN dataset is finished"

# projection
# Runs normalizeEmb.py in projNormMode for every language and every -r value,
# using that language's component file and writing proj_output_r<r>.json next
# to the input document_embedding.jsonl.
#
# NOTE(review): the component-dump step runs normalizeEmb.py from inside
# utils/, whereas here it is invoked from the working directory. Confirm the
# script is reachable from this location.
for lg in en zh fr; do
    # Input and output live in the same per-language directory.
    un_emb_dir="${rootdir}/xgiga_dumpEmb_encoder/embedding/UN_${lg}"
    for r in 1 2 3 4 10 20; do
        python3 normalizeEmb.py -m projNormMode -r "${r}" \
            --component "${component_dir}/${lg}_component.npy" \
            -i "${un_emb_dir}/document_embedding.jsonl" \
            -o "${un_emb_dir}/proj_output_r${r}.json"
    done
done
echo "Projection is finished"

# visualize projection results
# For every -r value, runs visualizeEmb.py in visualizeDocEmbPair mode on the
# English projection paired with the French and then the Chinese projection.
# Output goes to ${graph_dir} under the name proj_r<r>_en<lang>.
un_emb_root="${rootdir}/xgiga_dumpEmb_encoder/embedding"
for r in 1 2 3 4 10 20; do
    # fr before zh keeps the original invocation order.
    for tlg in fr zh; do
        python3 visualizeEmb.py -m visualizeDocEmbPair \
            --system-files \
            "${un_emb_root}/UN_en/proj_output_r${r}.json" \
            "${un_emb_root}/UN_${tlg}/proj_output_r${r}.json" \
            -n "proj_r${r}_en${tlg}" -o "${graph_dir}"
    done
done
echo "Visualization is finished"

# Calculate retrieval accuracy
# For each target language (zh, fr) and every -r value, runs visualizeEmb.py in
# retrievalGivenArray mode on the English and target-language projections.
un_emb_root="${rootdir}/xgiga_dumpEmb_encoder/embedding"
for tlg in zh fr; do
    echo "Calculating retrieval accuracy for en-${tlg}"
    for r in 1 2 3 4 10 20; do
        python3 visualizeEmb.py -m retrievalGivenArray --system-files \
            "${un_emb_root}/UN_en/proj_output_r${r}.json" \
            "${un_emb_root}/UN_${tlg}/proj_output_r${r}.json"
    done
done
echo "Calculating retrieval accuracy is finished"