#!/bin/bash

# Validate the command line: exactly five positional arguments are required
# (input-dir, fileset, results-dir, temp-dir, phase).
if (( $# != 5 )); then
  # The usage text is a diagnostic, so it goes to stderr, and the exit status
  # is non-zero so that a calling script can detect the bad invocation.
  {
    echo "Usage is $0 input-dir fileset results-dir temp-dir TRAIN/TUNE/TEST"
    echo "e.g. $0 ~/corpora/wmt10/training europarl-v5 ~/results ~/temp TRAIN"
    echo "e.g. $0 ~/corpora/wmt10/dev news-test2008 ~/results ~/temp TUNE"
    echo "Previous step's results will be taken from results-dir/tokenized and results-dir/parsed"
    echo "Results will be stored in results-dir/reordered"
  } >&2
  exit 1
fi

# Positional arguments (the argument count has been validated above).
input_dir=$1    # directory holding the original corpus files
fileset=$2      # base name of the file set, e.g. europarl-v5
output_dir=$3   # results directory shared by the pipeline steps
temp_dir=$4     # scratch directory for intermediate reorderer output
phase=$5        # "TRAIN" selects the training layout; any other value is
                # handled as TUNE/TEST

# Inputs from the previous steps and the destination for this step.
tokenized_dir=$output_dir/tokenized
parsed_dir=$output_dir/parsed
reordered_dir=$output_dir/reordered

# Quoted so that paths containing whitespace or glob characters survive;
# nothing below can work without these directories, so stop if they cannot
# be created.
mkdir -p -- "$reordered_dir" "$temp_dir" || exit 1

# External tools. en_tokenizer holds a command together with its arguments
# in one string; it is expanded unquoted at its call site so that it splits
# into separate words.
reorderer=/home/showlett/phd/code/reordering/Collins_rules.py
en_tokenizer="/usr/local/bin/moses-scripts/scripts-20110118-1208/tokenizer/tokenizer.perl -l en"

#######################################
# Run the reorderer on one German file and restore literal parentheses.
# Globals:   reorderer, tokenized_dir, parsed_dir, temp_dir,
#            reordered_dir (all read)
# Arguments: $1 - file name; the same name is used in the tokenized,
#                 parsed, temp and reordered directories
# Outputs:   writes "$reordered_dir/$1"; diagnostics go to stderr
# Returns:   0 on success, otherwise the failing command's exit status
#######################################
reorder_german() {
  local name=$1
  local scratch=$temp_dir/$name
  local status=0

  # The reorderer is given the scratch path as its third argument and is
  # expected to leave its output there.
  if "$reorderer" "$tokenized_dir/$name" "$parsed_dir/$name" "$scratch"; then
    # Turn the *LRB* / *RRB* placeholders back into "(" and ")".
    sed -e 's/\*LRB\*/(/g' -e 's/\*RRB\*/)/g' \
      < "$scratch" > "$reordered_dir/$name" || status=$?
  else
    status=$?
  fi

  # The scratch file is removed on every path, including failures.
  rm -f -- "$scratch"

  if (( status != 0 )); then
    echo "$0: reordering of $name failed (status $status)" >&2
  fi
  return "$status"
}

if [[ $phase == "TRAIN" ]]; then
  # Training data: parallel plain-text files.
  de_filename=${fileset}.de-en.de
  en_filename=${fileset}.de-en.en

  reorder_german "$de_filename" || exit 1

  # shellcheck disable=SC2086 -- en_tokenizer is "command args" in one
  # string and must be word-split here.
  $en_tokenizer < "$input_dir/$en_filename" > "$reordered_dir/$en_filename"
else
  # Any phase other than TRAIN (TUNE or TEST): plain-text German source
  # plus SGML reference and SGML source files.
  de_filename=${fileset}-src.de
  en_filename=${fileset}-ref.en.sgm
  origde_filename=${fileset}-src.de.sgm

  reorder_german "$de_filename" || exit 1

  # The English reference is copied unchanged.
  cp -- "$input_dir/$en_filename" "$reordered_dir/$en_filename"

  # The original SGML source is copied for every non-TRAIN phase; an
  # earlier restriction of this copy to the TEST phase was disabled.
  cp -- "$input_dir/$origde_filename" "$reordered_dir/$origde_filename"
fi
