#!/usr/bin/env bash
# set -e

# Usage: bash predata.sh

# Run from the project root: the steps below invoke fairseq/ and utils/
# through relative paths, so abort when the directory is unavailable instead
# of continuing from whatever directory the script was started in.
cd /opt/tiger/sumtest/crossLingualTransfer || {
    echo "cannot cd to /opt/tiger/sumtest/crossLingualTransfer" >&2
    exit 1
}

# Raw corpora (inputs) and the root directory for everything this script writes.
xgiga_dir="/mnt/bd/lab-wxz/clt/xgiga"
cc100_dir="/mnt/bd/lab-wxz/clt/cc100"
out_dir="/mnt/bd/lab-wxz/clt/langReverse"

# Splits available for xgiga.
PARTS=(train dev test)
echo "Parts are : ${PARTS[*]}"

# Splits available for cc100.
cc100_PARTS=(train dev)
echo "cc100 Parts are: ${cc100_PARTS[*]}"

# MODE is not assigned in this script before this point; when it is unset or
# empty, TOKEN is "${out_dir}/" and the mkdir simply ensures out_dir exists.
# `mkdir -p` is a no-op for an existing directory, so no -d guard is needed.
TOKEN="${out_dir}/${MODE:-}"
mkdir -p "$TOKEN"

#######################################
# Encode a text file with mBART's SentencePiece model.
# On first use, fetches and unpacks the mbart.cc25.v2 archive and the
# extended dictionary into /home/tiger.
# Arguments:
#   $1 - path of the input text file
#   $2 - path the encoded output is written to
# Outputs:
#   Progress messages, the dictionary path and the first line of the
#   encoded file on stdout.
# Returns:
#   0 on success, non-zero if a download, the unpack or the encoding fails.
#######################################
tokenize(){
    local input=$1
    local output=$2
    local mbart=/home/tiger/mbart.cc25.v2
    local dict=$mbart/dict_extend_0716.txt
    local model=$mbart/sentence.bpe.model

    echo "tokenize ${input} to ${output} using mbart's spm..."

    # Fetch the pretrained checkpoint only when it is not already unpacked.
    if [ ! -d "$mbart" ]; then
        hdfs_get hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/mbart.cc25.v2.tar.gz /home/tiger || return 1
        tar -xvzf /home/tiger/mbart.cc25.v2.tar.gz -C /home/tiger || return 1
    fi

    echo "$dict"
    # The extended dictionary is fetched separately from the archive.
    if [ ! -f "$dict" ]; then
        hdfs dfs -get hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Workspace/Multilingual/pretrained/dict_extend_0716.txt "$mbart" || return 1
    fi

    # Quoted redirections keep paths containing whitespace intact; stop here on
    # failure so the preview below never shows a stale or missing file.
    python3 fairseq/scripts/spm_encode.py --model="$model" < "$input" > "$output" || return 1

    echo "first line of ${output}"
    head -n 1 "$output"
}

#######################################
# Binarize SPM-encoded train/dev files with fairseq/preprocess.py, using the
# extended mBART dictionary for both the source (doc) and target (sum) side.
# Arguments:
#   $1 - directory holding train.<tag>.spm.* and dev.<tag>.spm.* files
#   $2 - destination directory for the binarized data
#   $3 - tag used in the file names (train.<tag>.spm / dev.<tag>.spm)
#   $4 - optional number of preprocess workers (default: 70)
# Outputs:
#   One log line on stdout, plus whatever preprocess.py prints.
# Returns:
#   2 when fewer than three arguments are given, otherwise the exit status
#   of preprocess.py.
#######################################
binary(){
  if [ "$#" -lt 3 ]; then
    echo "usage: binary INPUT_DIR BIN_DIR TAG [WORKERS]" >&2
    return 2
  fi

  # Locals keep the function from overwriting same-named globals of the
  # calling script (e.g. LG, which the top-level encoding steps also use).
  local input=$1
  local bindir=$2
  local lg=$3
  local workers=${4:-70}

  local mbart=/home/tiger/mbart.cc25.v2
  local dict=$mbart/dict_extend_0716.txt
  local tokentype="spm"

  echo "Binarized $input ($tokentype) to $bindir with dict $dict"

  python3 fairseq/preprocess.py \
    --source-lang doc \
    --target-lang sum \
    --trainpref "$input/train.$lg.$tokentype" \
    --validpref "$input/dev.$lg.$tokentype" \
    --destdir "$bindir" \
    --srcdict "$dict" \
    --tgtdict "$dict" \
    --workers "$workers"
}

# Directory holding the mBART-SPM encoded files. The encoding and merge steps
# below are currently commented out, but TOKEN is still read by the filter and
# binarize steps at the end of the script.
TOKEN="${out_dir}/MSPM"

# mspm encoding for xgiga en
# if [ ! -d ${TOKEN} ]; then
#   mkdir -p $TOKEN
# fi
# echo "[LOG] MSPM encoding for xgiga en"
# LG="en"
# LGTAG="en_XX"
# TASKTAG="summ"
# for SPLIT in ${PARTS[*]}
#   do
#     # echo $SPLIT
#     tokenize "${xgiga_dir}/$SPLIT.x.$LG" "$TOKEN/xgiga.notag.$SPLIT.$LG.spm.doc"
#     tokenize "${xgiga_dir}/$SPLIT.y.$LG" "$TOKEN/xgiga.notag.$SPLIT.$LG.spm.sum"
#     cat "$TOKEN/xgiga.notag.$SPLIT.$LG.spm.doc" | sed -e "s/< q >/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s> [${TASKTAG}]/" -e "s/^/[${LGTAG}] /" > "$TOKEN/xgiga.$SPLIT.$LG.spm.doc"
#     cat "$TOKEN/xgiga.notag.$SPLIT.$LG.spm.sum" | sed -e "s/< q >/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${LGTAG}] /" > "$TOKEN/xgiga.$SPLIT.$LG.spm.sum"
#   done

# # mspm encoding for cc100
# echo "[LOG] MSPM encoding for cc100"
# TASKTAG="denoise"
# declare -A LANG_TAGS
# LANG_TAGS=([en]="en_XX" [zh]="zh_CN" [fr]="fr_XX")

# for LG in en zh fr
# do
#   for SPLIT in ${cc100_PARTS[*]}
#   do
#     LGTAG=${LANG_TAGS[$LG]}
#     tokenize "${cc100_dir}/$LG.first500k.txt.${SPLIT}" "$TOKEN/cc100.notag.$SPLIT.$LG.spm"
#     cat "$TOKEN/cc100.notag.$SPLIT.$LG.spm" | sed -e "s/< q >/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s> [${TASKTAG}]/" -e "s/^/[${LGTAG}] /" > "$TOKEN/cc100.$SPLIT.$LG.spm.doc"
#     cat "$TOKEN/cc100.notag.$SPLIT.$LG.spm" | sed -e "s/< q >/ <\/s>/g" -e "s/^/<s> /" -e "s/$/ <\/s>/" -e "s/^/[${LGTAG}] /" > "$TOKEN/cc100.$SPLIT.$LG.spm.sum"
#   done
# done

# merge data
# The helper scripts are invoked by bare file name, so they must be run from
# utils/; abort if the directory is missing, otherwise the later `cd ..`
# would leave the project root.
cd utils || { echo "cannot cd to utils" >&2; exit 1; }
# for SPLIT in ${cc100_PARTS[*]}
# do
  # python3 mergeFileByLength.py --inputs $TOKEN/xgiga.$SPLIT.en.spm.sum $TOKEN/cc100.$SPLIT.en.spm.sum $TOKEN/cc100.$SPLIT.fr.spm.sum $TOKEN/cc100.$SPLIT.zh.spm.sum --output $TOKEN/$SPLIT.merged.spm.sum
# done
# python3 mergeFileByLength.py --input-prefix $TOKEN/xgiga.train.en.spm $TOKEN/cc100.train.en.spm $TOKEN/cc100.train.fr.spm $TOKEN/cc100.train.zh.spm --output-prefix $TOKEN/train.merged.spm -b 5
# python3 mergeFileByLength.py --input-prefix $TOKEN/xgiga.dev.en.spm $TOKEN/cc100.dev.en.spm $TOKEN/cc100.dev.fr.spm $TOKEN/cc100.dev.zh.spm --output-prefix $TOKEN/dev.merged.spm -b 8

# Filter the merged train/dev doc+sum pairs into $TOKEN/filtered/ with a size
# limit of 512 (presumably a maximum length in tokens; confirm against
# utils/filterByMaxSize.py). The paths are built from $TOKEN so they follow
# out_dir instead of repeating its absolute value.
for SPLIT in train dev; do
  python3 filterByMaxSize.py "$TOKEN/$SPLIT.merged.spm.doc" "$TOKEN/$SPLIT.merged.spm.sum" "$TOKEN/filtered/" 512 || {
    # Do not binarize leftovers of an earlier run after a failed filter.
    echo "filterByMaxSize.py failed for split $SPLIT" >&2
    exit 1
  }
done

# Back to the project root: binary() calls fairseq/preprocess.py relatively.
cd .. || { echo "cannot cd back to the project root" >&2; exit 1; }
binary "$TOKEN/filtered" "${out_dir}/data-bin" "merged"
