#!/bin/bash

# Validate the command line: exactly six positional arguments are required.
# On misuse the usage text goes to stderr and the script exits with a
# non-zero status, so calling scripts and pipelines can detect the failure
# and the help text never ends up in captured stdout.
if (( $# != 6 )); then
  {
    echo "Usage is $0 grammar input-dir input-file results-dir temp-dir num-batches"
    echo "e.g. $0 german.gr ~/corpora/wmt10/training europarl-v5.de-en.de ~/results ~/temp 20"
    echo "Results will be stored in results-dir/tokenized and results-dir/parsed"
  } >&2
  exit 1
fi

# Positional arguments (the argument count is checked before this point).
grammar=$1      # grammar file, handed to the parser unchanged
input_dir=$2    # directory that contains the input file
filename=$3     # name of the input file inside input_dir
output_dir=$4   # root directory for the results
temp_dir=$5     # scratch directory; also handed to the parser
batches=$6      # number of batches, handed to the parser unchanged

# Results are written to two sub-directories of the output directory.
tokenized_dir=$output_dir/tokenized
parsed_dir=$output_dir/parsed

# Create every directory that is written to later. The arguments are quoted so
# that paths containing whitespace or glob characters stay intact, and the
# script stops here if a directory cannot be created instead of failing later
# on a redirection.
mkdir -p -- "$tokenized_dir" "$parsed_dir" "$temp_dir" || exit 1

# Input file and the default locations of the two result files (same file name
# as the input, one per result sub-directory).
input=$input_dir/$filename
tokenized=$tokenized_dir/$filename
parsed=$parsed_dir/$filename

# Hard-coded location of the Moses support scripts.
moses_scripts=/usr/local/bin/moses-scripts/scripts-20110118-1208
unwrap=$moses_scripts/ems/support/input-from-sgm.perl
# Command and its options are kept in ONE string on purpose: it is expanded
# unquoted where it is run so that it word-splits into command + arguments.
# NOTE(review): "-l de" presumably selects German tokenization rules - confirm.
tokenizer="$moses_scripts/tokenizer/tokenizer.perl -l de"

# Hard-coded location of the parser executable.
parser_dir=/home/showlett/phd/code/parsing
parser=$parser_dir/distributed_parser


# Main pipeline: unwrap (if needed) -> tokenize -> parse.
#
# Reads:   $input
# Writes:  $tokenized, $parsed (via the parser), $temp_dir/unwrapped (scratch)
# Exit:    1 if the input cannot be unwrapped/copied, otherwise the exit
#          status of the parser.

unwrapped=$temp_dir/unwrapped

# Remove the scratch copy on every exit path, including early failures.
trap 'rm -f -- "$unwrapped"' EXIT

# Files ending in .sgm are run through the unwrap script first, and the
# ".sgm" suffix is dropped from both result file names. A glob suffix match is
# used here: a *quoted* right-hand side of =~ is matched as a literal string by
# bash >= 3.2, so a quoted regex would never match and .sgm input would be
# copied through without unwrapping.
if [[ $input == *.sgm ]]; then
  "$unwrap" < "$input" > "$unwrapped" || exit 1
  tokenized=${tokenized%.sgm}
  parsed=${parsed%.sgm}
else
  cp -- "$input" "$unwrapped" || exit 1
fi

# Tokenize, then replace every "(" with *LRB* and every ")" with *RRB*.
# The tokenizer's stderr is discarded, as in the original pipeline.
# $tokenizer holds a command plus its options in one string and must be
# word-split, so it is intentionally left unquoted.
# shellcheck disable=SC2086
$tokenizer < "$unwrapped" 2> /dev/null \
  | sed -e 's/(/\*LRB\*/g' -e 's/)/\*RRB\*/g' > "$tokenized"

# Parse the tokenized text; the parser receives the grammar, the tokenized
# input, the output file, the scratch directory and the batch count.
"$parser" "$grammar" "$tokenized" "$parsed" "$temp_dir" "$batches"
