#!/bin/bash

# Require exactly five positional arguments. On misuse, print the usage text
# to stderr and stop with a non-zero status so that calling scripts can tell
# the run did not happen (a bare `exit` would report success here).
if [[ $# -ne 5 ]]; then
  {
    echo "Usage is $0 grammar-file input-file output-file temp-dir num-batches"
    echo "Note: input-file must already be tokenized"
  } >&2
  exit 1
fi

# Jar that every batch job runs.
parser=/usr/local/berkeleyparser/BerkeleyParser.jar

# Positional arguments, in the order given by the usage message.
grammar=$1
input=$2
output=$3
tempdir=$4
batches=$5

# num-batches is used as a divisor below, so it must be a positive integer.
case $batches in
  '' | *[!0-9]*)
    echo "$0: num-batches must be a positive integer, got '$batches'" >&2
    exit 1
    ;;
esac
# 10# forces base 10 so that values such as "08" are not read as octal.
if ((10#$batches < 1)); then
  echo "$0: num-batches must be a positive integer, got '$batches'" >&2
  exit 1
fi

if [[ ! -r "$input" ]]; then
  echo "$0: cannot read input file '$input'" >&2
  exit 1
fi

mkdir -p -- "$tempdir" || exit 1

# Reading from stdin makes wc print only the count (no file name), so no
# field cutting is needed and padded output cannot yield an empty value.
totallines=$(wc -l < "$input") || exit 1

# Lines per chunk: the +1 keeps the size at least 1 and ensures split never
# produces more than num-batches chunks.
lines=$((totallines / 10#$batches + 1))

# Write the chunks as <temp-dir>/parserin_aa, parserin_ab, ...
split -l "$lines" -- "$input" "$tempdir/parserin_" || exit 1

# Accumulates the ids of the submitted jobs as ":id1:id2:...".
jobids=''

# Submit one qsub job per input chunk (<temp-dir>/parserin_*) and append each
# job id to $jobids, building a ":id1:id2:..." list.
for infile in "$tempdir"/parserin_*
do
  # An unmatched glob is left as the literal pattern; skip it so that no job
  # is created for a file that does not exist.
  [[ -e "$infile" ]] || continue

  # Build the sibling file names from the chunk suffix alone. Substituting
  # "parserin" inside the full path would rewrite the directory part instead
  # whenever the temp-dir path itself contains that word.
  suffix=${infile##*/parserin_}
  outfile=$tempdir/parserout_$suffix
  errfile=$tempdir/parsererr_$suffix

  prefix=$tempdir/parserscript_$suffix

  jobscript=$prefix.sh
  qsubout=$prefix.out
  qsuberr=$prefix.err

  # Generate the job script. %q shell-quotes each path so that names with
  # spaces or metacharacters survive being re-parsed when the job runs;
  # ordinary paths are written unchanged.
  {
    echo "#!/bin/bash"  # this line may not be necessary
    printf 'LANG=en_AU.utf8 java -Xmx9000m -jar %q -gr %q -confidence -tree_likelihood < %q > %q 2> %q\n' \
      "$parser" "$grammar" "$infile" "$outfile" "$errfile"
  } > "$jobscript"

  # qsub's stdout and stderr both go to <jobscript>.log. Stop on failure:
  # otherwise the first word of an error message would be taken as a job id.
  if ! qsub -l walltime=400:00:00,cput=400:00:00,nodes=1:ppn=4 \
    -o "$qsubout" -e "$qsuberr" "$jobscript" > "$jobscript.log" 2>&1
  then
    echo "$0: qsub failed for $jobscript; see $jobscript.log" >&2
    exit 1
  fi

  # The job id is taken to be the first word on the first line of the log.
  jobid=''
  read -r jobid _ < "$jobscript.log"
  if [[ -z "$jobid" ]]; then
    echo "$0: no job id found in $jobscript.log" >&2
    exit 1
  fi
  jobids="$jobids:$jobid"
done

# Barrier: submit a trivial job that depends on every id in $jobids and whose
# stdout file ($checkpointfile) serves as the "all batches finished" marker.
syncscript=$tempdir/syncscript.sh
checkpointfile=$tempdir/syncscript

{
  echo "#!/bin/bash"
  echo "date"
} > "$syncscript"

# Remove any marker left by an earlier run so the wait below cannot be
# satisfied by a stale file.
rm -f -- "$checkpointfile"

# $jobids already starts with ":", giving "depend=afterok:id1:id2:...".
# If the submission fails the marker would never appear, so stop here rather
# than polling forever.
if ! qsub -W "depend=afterok$jobids" -o "$checkpointfile" -e /dev/null \
  "$syncscript" > "$syncscript.log" 2>&1
then
  echo "$0: could not submit the sync job; see $syncscript.log" >&2
  exit 1
fi

# Poll until the marker file appears.
# NOTE(review): if any batch job ends with an error, the afterok dependency is
# presumably never satisfied and this loop would wait indefinitely — confirm
# against the scheduler's behaviour.
while [[ ! -e "$checkpointfile" ]]
do
  sleep 10
done

# Concatenate the per-chunk outputs; the glob expands in sorted order, which
# matches the order of the chunk suffixes. Keep the intermediate files if this
# fails so that nothing is lost.
if ! cat -- "$tempdir"/parserout_* > "$output"; then
  echo "$0: failed to assemble $output; intermediate files kept in $tempdir" >&2
  exit 1
fi

rm -f -- "$tempdir"/parserin_* "$tempdir"/parserout_* "$tempdir"/parsererr_* \
  "$tempdir"/parserscript* "$tempdir"/syncscript*
