#!/usr/bin/env bash

# This script produces the results for the submitted paper on the Yahoo Answers
# topic classification dataset.


myshuf() {
  perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
}

# Turn raw text lines on stdin into normalized training lines on stdout.
# Steps, in order:
#   1. lowercase everything;
#   2. prepend "__label__" to every line;
#   3. put spaces around ' . , ( ) ! ?, delete double quotes, and replace
#      "<br />", ";" and ":" with a single space;
#   4. squeeze runs of spaces into one;
#   5. shuffle the line order with myshuf.
normalize_text() {
  tr '[:upper:]' '[:lower:]' \
    | sed -e 's/^/__label__/g' \
          -e "s/'/ ' /g" \
          -e 's/"//g' \
          -e 's/\./ \. /g' \
          -e 's/<br \/>/ /g' \
          -e 's/,/ , /g' \
          -e 's/(/ ( /g' \
          -e 's/)/ ) /g' \
          -e 's/\!/ \! /g' \
          -e 's/\?/ \? /g' \
          -e 's/\;/ /g' \
          -e 's/\:/ /g' \
    | tr -s " " \
    | myshuf
}


# Names of the datasets handled by this script.
DATASET=(yahoo_answers)

# Download identifiers, index-aligned with DATASET; each one is appended to
# the download host URL further down.
ID=(
  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
)


# Where downloaded archives are stored and where experiment outputs go.
DATADIR=./Data
RESULTDIR=ExpResults
RAWDATASET="${DATADIR}/yahoo_answers_csv.tar.gz"

# Locations of the external tools used by the experiments.
FastTextTool=./Tools/fastText
KenSlmTool=./Tools/kenlm/build/bin

# Make sure both working directories exist before anything is written.
mkdir -p "${RESULTDIR}" "${DATADIR}"

# Download every dataset archive listed in DATASET (IDs taken from the
# index-aligned ID array) into DATADIR, skipping archives already present.
#
# The transfer goes to "<archive>.part" and is renamed only after wget
# succeeds. wget -O creates its output file even when the transfer fails, so
# writing straight to the final name would leave an empty or truncated archive
# that the "already downloaded" check below accepts on the next run.
# With -c, a leftover .part file is resumed instead of restarted.
#
# NOTE(review): the googledrive.com/host endpoint may no longer be served by
# Google; confirm the URL, or place the archive in DATADIR manually.
for i in "${!DATASET[@]}"; do
  archive="${DATADIR}/${DATASET[i]}_csv.tar.gz"
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${archive}" ]; then
    if wget -c "https://googledrive.com/host/${ID[i]}" -O "${archive}.part"; then
      mv -- "${archive}.part" "${archive}"
    else
      # Without the archive nothing downstream can work, so stop here.
      echo "ERROR: could not download ${DATASET[i]};" \
        "place the archive at ${archive} manually and re-run." >&2
      exit 1
    fi
  fi
done


### Preparing data corpus
# Extract the raw archive into RESULTDIR, normalize train/test CSVs into
# fastText-style files, copy the class list, and split the training data.
# The whole step is skipped when yahoo_answers.unlabeled.train already exists
# (it is the last file produced, so its presence marks a completed run).
mkdir -p "${RESULTDIR}"
echo "Extracting public yahoo answers dataset from: ${RAWDATASET}"
if [ ! -f "${RESULTDIR}/yahoo_answers.unlabeled.train" ]; then
  # Abort on a failed extraction: otherwise the steps below would write empty
  # output files, including the marker file checked above, and later runs
  # would silently skip extraction and train on empty data.
  if ! tar -xzvf "${RAWDATASET}" -C "${RESULTDIR}"; then
    echo "ERROR: failed to extract ${RAWDATASET}" >&2
    exit 1
  fi

  normalize_text < "${RESULTDIR}/yahoo_answers_csv/train.csv" \
    > "${RESULTDIR}/yahoo_answers.full.train"
  normalize_text < "${RESULTDIR}/yahoo_answers_csv/test.csv" \
    > "${RESULTDIR}/yahoo_answers.test"
  cp "${RESULTDIR}/yahoo_answers_csv/classes.txt" \
    "${RESULTDIR}/yahoo_answers.classes.txt"

  # Split the full 1.4M labeled training set into two parts: the first 100K
  # lines stay labeled, the last 1.3M lines act as (pretended) unlabeled data.
  head -n100000 "${RESULTDIR}/yahoo_answers.full.train" \
    > "${RESULTDIR}/yahoo_answers.labeled.train"
  tail -n1300000 "${RESULTDIR}/yahoo_answers.full.train" \
    > "${RESULTDIR}/yahoo_answers.unlabeled.train"
fi



##########: Result #1: 1.4M Supervised Learning ###############
# Train a fastText supervised model on the full training file and evaluate it
# on the test file.
WorkDir=${RESULTDIR}/UpperBoundBaseline
# The model is written to ${WorkDir}/maintask.model.bin, so the directory has
# to exist before training starts (the later experiment sections create
# theirs the same way).
mkdir -p "${WorkDir}"
echo "Testing on 1.4M supervised learning "
"${FastTextTool}/fasttext" supervised -input "${RESULTDIR}/yahoo_answers.full.train" \
  -output "${WorkDir}/maintask.model" -dim 10 -lr 0.1 -wordNgrams 2 \
  -minCount 1 -bucket 10000000 -epoch 5 -thread 1
"${FastTextTool}/fasttext" test "${WorkDir}/maintask.model.bin" \
  "${RESULTDIR}/yahoo_answers.test"


##########: Result #2: 100K  Supervised Learning ###############
# Train a fastText supervised model on the 100K labeled subset only and
# evaluate it on the test file.

WorkDir=${RESULTDIR}/LowerBoundBaseline
# The model is written to ${WorkDir}/maintask.model.bin, so the directory has
# to exist before training starts (the later experiment sections create
# theirs the same way).
mkdir -p "${WorkDir}"
echo "Testing on 100K supervised learning "
"${FastTextTool}/fasttext" supervised -input "${RESULTDIR}/yahoo_answers.labeled.train" \
  -output "${WorkDir}/maintask.model" -dim 10 -lr 0.1 -wordNgrams 2 \
  -minCount 1 -bucket 10000000 -epoch 5 -thread 1
"${FastTextTool}/fasttext" test "${WorkDir}/maintask.model.bin" \
  "${RESULTDIR}/yahoo_answers.test"


##########: Result #3: Co-Training ###############
# Run the co-training baseline. cotraining.py receives the working directory
# and the fastText tool directory as its two arguments.
# Expansions are quoted so paths containing spaces or glob characters are
# passed through as single arguments.
WorkDir=${RESULTDIR}/CoTraining
mkdir -p "${WorkDir}"
echo "Testing 100k labeled data + 1.3M unlabeled with co-training  baseline"
python cotraining.py "${WorkDir}" "${FastTextTool}"


##########: Result #4: Self-Training ###############
# Run the self-training baseline. selfTraining.py receives the working
# directory and the fastText tool directory as its two arguments.
# Expansions are quoted so paths containing spaces or glob characters are
# passed through as single arguments.
WorkDir=${RESULTDIR}/SelfTraining
mkdir -p "${WorkDir}"
echo "Testing 100k labeled data + 1.3M unlabeled with self-training  baseline"
python selfTraining.py "${WorkDir}" "${FastTextTool}"


##########: Result #5:  Active learning with 100K labeled data + 1.3M unlabeled data  ###############
# Experimented with corpus-enriching sizes from 100K, 200K, 400K, up to 1M to
# see when the result converges to the upper bound.
# activeLearning.py receives the working directory and the fastText tool
# directory as its two arguments; expansions are quoted so paths containing
# spaces or glob characters are passed through as single arguments.
WorkDir=${RESULTDIR}/ActiveLearning
mkdir -p "${WorkDir}"
echo "Testing 100k labeled data + 1.3M unlabeled Active Learning baseline"
python activeLearning.py "${WorkDir}" "${FastTextTool}"



##########: Result #6: EMAEC without Enriched Data ###############
# Two steps sharing one working directory:
#   1. trainSLMs.py is given the working directory, the KenLM binary
#      directory, the literal "labeled" and the number 3
#      (NOTE(review): 3 is presumably the n-gram order; confirm in the script);
#   2. emaecLearning.py is given the working directory and the fastText tool
#      directory.
# Expansions are quoted so paths containing spaces or glob characters are
# passed through as single arguments.
WorkDir=${RESULTDIR}/EmaecLearning
mkdir -p "${WorkDir}"
echo "Training SLMs based on 100K labeled data ..."
python trainSLMs.py "${WorkDir}" "${KenSlmTool}" labeled 3

echo "Testing  EMAEC Learning with 100k labeled data + 1.3M unlabeled Parallel Learning"
python emaecLearning.py "${WorkDir}" "${FastTextTool}"


##########: Result #7: EMAEC with Enriched Data ###############
# First run blurringLabels.py on RESULTDIR, then repeat the EMAEC experiment
# once per blur percentage, each in its own working directory.
# Expansions are quoted so paths containing spaces or glob characters are
# passed through as single arguments; the printed messages are unchanged.

echo "Testing  EMAEC Learning with 100k labeled data + 1.3M simulated user feedback data at diffrent level"
echo "Blurring unlabeled data to simulate user feedback behavior results ... "
python blurringLabels.py "${RESULTDIR}"


for pct in 5 10 15 20 30 40 50 60 70; do
  WorkDir=${RESULTDIR}/EmaecLearningBlurPL${pct}
  mkdir -p "${WorkDir}"

  echo "Training SLMs based on labeled data ..."
  python trainSLMs.py "${WorkDir}" "${KenSlmTool}" labeled 3

  echo "Testing  EMAEC Learning with 100k labeled data + 1.3M blurred unlabeled data with ${pct} percentage"
  python emaecLearningBlured.py "${WorkDir}" "${FastTextTool}" "${pct}"
done


#########: Result #8: Supervised learning with  Noisy Enriched Data at different noise levels###############
# For each blur percentage, train a fastText supervised model on
# ${RESULTDIR}/yahoo_answers.blurred<pct>.train and evaluate it on the test
# file, keeping each model in its own working directory.
# NOTE(review): the blurred training files are presumably written by
# blurringLabels.py in the Result #7 section; confirm before running this
# section on its own.
# Expansions are quoted so paths containing spaces or glob characters are
# passed through as single arguments; the printed messages are unchanged.

echo "Testing  Supervised baseline method with 100k labeled data + 1.3M simulated user feedback data at different noise level"

for pct in 5 10 15 20 30 40 50 60 70; do
  WorkDir=${RESULTDIR}/BaselineSupervisedLearningBlurPL${pct}
  mkdir -p "${WorkDir}"

  echo "Testing  supervised baseline method with 100k labeled data + 1.3M blurred unlabeled data with ${pct} percentage"

  "${FastTextTool}/fasttext" supervised -input "${RESULTDIR}/yahoo_answers.blurred${pct}.train" \
    -output "${WorkDir}/maintask.model" -dim 10 -lr 0.1 -wordNgrams 2 \
    -minCount 1 -bucket 10000000 -epoch 5 -thread 1
  "${FastTextTool}/fasttext" test "${WorkDir}/maintask.model.bin" \
    "${RESULTDIR}/yahoo_answers.test"

done

