#!/bin/sh

# Path of the EMNLP_MATERIAL folder (the folder this file is in) on your machine.
# Replace the placeholder text below with that path before running the script;
# it is written into every generated experiment script.
HOMEF='CHANGE THIS TO THE FOLDER THIS FILE IS IN'

# This script converts the Kim & Mooney data to the format needed by our scripts.
# The data files are assumed to reside in ./xml-data.

# It also creates the scripts that run the experiments, and the grammars the experiments are run with.

#convert the korean data to utf-8

mkdir -p xml-data/data-kr/gold_utf8
mkdir -p xml-data/data-kr/training_utf8

# euc_kr_to_utf8 SRC DST
#   SRC - input file encoded in EUC-KR
#   DST - output file, written in UTF-8 (overwritten if it exists)
# Re-encodes SRC with iconv and, on every line, rewrites the first occurrence
# of the text "euc-kr" (in any letter case) to "utf-8", so that an encoding
# name mentioned inside the file agrees with the new encoding.
# The case-insensitive match is spelled out with bracket expressions because
# a case-insensitivity flag on the s command is a GNU sed extension, not POSIX.
euc_kr_to_utf8() {
	iconv -f euc-kr -t utf-8 "$1" | sed 's/[Ee][Uu][Cc]-[Kk][Rr]/utf-8/' > "$2"
}

for i in 1 2 3 4
do
	euc_kr_to_utf8 "xml-data/data-kr/gold/200${i}-final-gold" "xml-data/data-kr/gold_utf8/200${i}-final-gold"
	# NOTE(review): unlike the gold files, the training input name has no hyphen
	# before "final" and ends in "-train"; kept exactly as before - confirm
	# against the actual layout of xml-data/data-kr/training.
	euc_kr_to_utf8 "xml-data/data-kr/training/200${i}final-train" "xml-data/data-kr/training_utf8/200${i}-final-training"
done

# Run the data converter once per language, English first and Korean second.
# Each call receives three paths: the training input folder, the gold input
# folder and the experiments/<Language>Data folder (presumably the output
# location - see processData.py). The Korean call reads the UTF-8 copies.
process_data=src/data/processData.py
kr_root=xml-data/data-kr
python "$process_data" xml-data/data/training xml-data/data/gold experiments/EnglishData
python "$process_data" "${kr_root}/training_utf8" "${kr_root}/gold_utf8" experiments/KoreanData


#setting up the training-files and the yld-files for evaluation

#trainingfiles - leave-one-out training files, all possible 4 combinations, for both English and Korean

# build_leave_one_out DIR
#   DIR - folder containing training1.txt .. training4.txt; the files
#         leave1.train .. leave4.train are written into the same folder
# leaveN.train is the concatenation of the three files other than
# trainingN.txt. The files are taken in the order 1, 2, 3 with training4.txt
# occupying the position of the held-out file, which reproduces the order
# the files were previously listed in by hand.
# On every line, everything up to and including the last "::" is removed;
# lines without "::" pass through unchanged.
# The pattern uses "..*" rather than ".\+" because "\+" is a GNU extension
# to basic regular expressions.
build_leave_one_out() {
	loo_dir=$1
	for loo_held_out in 1 2 3 4
	do
		# Collect the three input files as positional parameters so that
		# folder names containing whitespace survive intact.
		set --
		for loo_slot in 1 2 3
		do
			if [ "$loo_slot" -eq "$loo_held_out" ]
			then
				set -- "$@" "${loo_dir}/training4.txt"
			else
				set -- "$@" "${loo_dir}/training${loo_slot}.txt"
			fi
		done
		cat "$@" | sed 's/..*::\(..*\)$/\1/' > "${loo_dir}/leave${loo_held_out}.train"
	done
}

build_leave_one_out experiments/EnglishData/training
build_leave_one_out experiments/KoreanData/training

#gold-ylds

# gold_to_yield
#   Filter from standard input to standard output.
# For every line of the form "<prefix>::<text> --> <rest>" only <text> is
# kept; lines that do not have this form pass through unchanged.
# The pattern uses "..*" rather than ".\+" because "\+" is a GNU extension
# to basic regular expressions.
gold_to_yield() {
	sed 's/^..*::\(..*\) --> ..*$/\1/'
}

for i in 1 2 3 4
do
	gold_to_yield < "experiments/EnglishData/gold/gold${i}.txt" > "experiments/EnglishData/gold/gold${i}.yld"
	gold_to_yield < "experiments/KoreanData/gold/gold${i}.txt" > "experiments/KoreanData/gold/gold${i}.yld"
done


# Create the folder tree for the experiments: one folder per grammar variant
# and language, each with a "jitter" subfolder (parents are created as needed).
for variant in WordOrder NoWordOrder
do
	for language in English Korean
	do
		mkdir -p "experiments/${variant}/${language}/jitter"
	done
done

# Generate one .pcfg grammar file per grammar variant and language.
# Each generator script is given the language's hashToCont.txt, meanings.txt
# and vocabulary.txt, followed by the path of the grammar file.
echo "Creating the Grammars"
for grammar in NoWordOrder WordOrder
do
	for language in English Korean
	do
		data_dir="experiments/${language}Data"
		python "src/grammars/${grammar}.py" "${data_dir}/hashToCont.txt" "${data_dir}/meanings.txt" "${data_dir}/vocabulary.txt" "./experiments/${grammar}/${language}/${grammar}.pcfg"
	done
done


#setting up the scripts
echo "Setting up the scripts"

# The generated job scripts embed HOMEF in every path, so a placeholder or
# mistyped value yields scripts that cannot run. Warn, but keep going.
if [ ! -d "${HOMEF}" ]
then
	echo "warning: HOMEF ('${HOMEF}') is not an existing directory; the generated scripts will contain invalid paths" >&2
fi

# write_job_script SCRIPT WORKDIR STEM IO_OPTS GRAMMAR TRAIN
#   SCRIPT  - path of the job script to (over)write
#   WORKDIR - directory the job changes into before running
#   STEM    - base name for the job's .time, .trace and .out files
#   IO_OPTS - options given to the io program ahead of -g
#   GRAMMAR - grammar file, as seen from WORKDIR
#   TRAIN   - training file handed to io
# The generated script starts with a "#! /bin/sh" line and a "#PBS -l"
# resource line, changes into WORKDIR, runs
# ${HOMEF}/software/inside-outside/io and brackets the run with two "date"
# calls written to STEM.time.
# Differences from a plain echo-by-echo generation, all deliberate:
#   * stdout and stderr of io are captured with "> file 2>&1"; the "&>"-style
#     shorthand ">& file" is rejected by POSIX shells such as dash, which is
#     what "#! /bin/sh" may resolve to;
#   * paths derived from HOMEF are double-quoted so that a HOMEF containing
#     whitespace still works;
#   * the job stops if WORKDIR cannot be entered instead of running io from
#     an unrelated directory.
write_job_script() {
	cat > "$1" <<EOF
#! /bin/sh
#PBS -l nodes=1:ppn=2,walltime=48:00:00,vmem=12000mb
cd "$2" || exit 1
date > $3.time
"${HOMEF}/software/inside-outside/io" $4 -g $5 -T ./$3.trace "$6" > $3.out 2>&1
date >> $3.time
EOF
}

for lang in English Korean
do
	for wo in WordOrder NoWordOrder
	do
		exp_dir="experiments/${wo}/${lang}"
		train_dir="${HOMEF}/experiments/${lang}Data/training"
		# Make sure the target folders exist so this step does not depend
		# on an earlier mkdir having succeeded.
		mkdir -p "${exp_dir}/jitter"
		for i in 1 2 3 4
		do
			# Jittered run (-j 0.1) at alpha 0.1; lives in the jitter
			# subfolder, hence the grammar is one level up.
			write_job_script "${exp_dir}/jitter/leave${i}_0.1.sh" "${HOMEF}/${exp_dir}/jitter" "leave${i}_0.1" "-d 1000 -j 0.1 -a 0.1" "../${wo}.pcfg" "${train_dir}/leave${i}.train"
			# Runs without jitter, one per alpha value.
			for alpha in 0.1 0.5 1.0
			do
				write_job_script "${exp_dir}/leave${i}_${alpha}.sh" "${HOMEF}/${exp_dir}" "leave${i}_${alpha}" "-d 1000 -a ${alpha}" "./${wo}.pcfg" "${train_dir}/leave${i}.train"
			done
		done
	done
done
