#!/bin/bash
###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################

###############################################################################
# check_binarizer.sh 
#
# This script should be used to check the correctness of the treebinarizer when 
# making changes to it.
#
# "testpball" should be available and contains all the prop-bank trees for checking
#
# HEADMARK and TAILMARK isolate the chunk to check in "testpball": the chunk is
# the last TAILMARK lines of the first HEADMARK lines of the tree file.
# E.g. to check sentences 1001 through 6000, use HEADMARK=6000, TAILMARK=5000
#
# 2 levels (or modes) of checking:
#
# Mode 1: check the final output only. It does the check of the (changed)
#   treebinarize_srl.pl on both srl and non-srl input trees. The comparison
#   is relative to the output of the original treebinarizer.pl. Since the output
#   file is relatively small for the diff command, the entire "testpball"
#   can be used at once. Provide the 3rd parameter to run this mode.
#   E.g. scripts/check_binarizer.sh 112917 112917 1
#   check both test112917.result and test112917.result.srlbinarizer 
#   Ideally these 2 files should be empty. Inspect if any of them is not.
#
# Mode 2: check step-by-step transformation of the binarizer. Should do about
#   5000 sentences at a time or the result is too large for the diff to work.
#   Inspect the result file and spot the rule that was not triggered (or,
#   equivalently, was unexpectedly triggered), leading to a different value of $_
#   E.g. scripts/check_binarizer.sh 11000 5000
#   check file test11000.debug.result when done. Good if this file shows
#   only difference in line number.
#
# Author: Luan Nguyen
###############################################################################

#######################################
# Print a diagnostic on stderr, prefixed with the script name.
# Arguments: $* - message text
#######################################
err() {
	printf 'check_binarizer.sh: %s\n' "$*" >&2
}

#######################################
# Cut a chunk out of genmodel/all.wsj.trees, run scripts/treebinarize.pl on
# the chunk both with and without its SRL labels, and diff the two results
# after stripping the labels from the labelled run.
#
# Arguments:
#   $1 HEADMARK - number of leading lines of the tree file to keep
#   $2 TAILMARK - number of trailing lines of that head to keep (the chunk)
#   $3 OUTERR   - optional; any non-empty value compares the binarizer's
#                 stdout (final trees). When absent or empty, the binarizer
#                 is run with -d and its stderr (debug trace) is compared.
#   With no arguments at all, HEADMARK and TAILMARK both default to the line
#   count of the tree file and the stdout comparison is used.
# Outputs:
#   Status messages on stdout, diagnostics on stderr. Creates, in the current
#   directory, testpball_<HEADMARK>, testtball_<HEADMARK>, their derived
#   *.bintree / *.strip.srl files, and test<HEADMARK>.result or
#   test<HEADMARK>.debug.result.
# Returns:
#   0 once a comparison was carried out (whether or not the files match),
#   1 if the input is unusable or the binarizer fails, 2 on bad arguments.
#######################################
main() {
	local trees="genmodel/all.wsj.trees"
	local binarizer="scripts/treebinarize.pl"
	local HEADMARK="${1:-}"
	local TAILMARK="${2:-}"
	local OUTERR="${3:-}"

	# Without the tree file every intermediate file would be empty and the
	# final diff would report a meaningless match.
	if [[ ! -r "$trees" ]]; then
		err "cannot read $trees"
		return 1
	fi

	if [[ $# -eq 0 ]]; then
		# Read from stdin so wc prints only the count; strip the padding
		# some wc implementations put in front of it.
		HEADMARK=$(wc -l < "$trees") || return 1
		HEADMARK=${HEADMARK//[[:space:]]/}
		TAILMARK=$HEADMARK
		OUTERR="1"
		echo "Both HEADMARK and TAILMARK set to $HEADMARK. Check stdout"
	fi

	if [[ ! "$HEADMARK" =~ ^[0-9]+$ || ! "$TAILMARK" =~ ^[0-9]+$ ]]; then
		err "usage: scripts/check_binarizer.sh [HEADMARK TAILMARK [OUTERR]]"
		err "HEADMARK and TAILMARK must be non-negative integers"
		return 2
	fi

	# pb: chunk with SRL labels; tb: same chunk with the labels stripped.
	local pb="testpball_$HEADMARK"
	local tb="testtball_$HEADMARK"

	head -n "$HEADMARK" "$trees" | tail -n "$TAILMARK" > "$pb"
	if [[ ! -s "$pb" ]]; then
		err "selected chunk $pb is empty; nothing to check"
		return 1
	fi
	sed 's/:[^):]*)/)/g' "$pb" > "$tb"

	# OUTERR must be quoted: an unquoted empty value turns the test into
	# "[ -n ]", which is always true and would hide the trace mode.
	if [[ -n "$OUTERR" ]]; then
		echo "Check binary tree outputs"
		# stderr is discarded on purpose here: only the final trees matter.
		if ! perl "$binarizer" < "$tb" > "$tb.bintree" 2> /dev/null; then
			err "perl $binarizer failed on $tb"
			return 1
		fi
		if ! perl "$binarizer" < "$pb" > "$pb.bintree" 2> /dev/null; then
			err "perl $binarizer failed on $pb"
			return 1
		fi
		sed 's/:[^):]*)/)/g' "$pb.bintree" > "$pb.bintree.strip.srl"
		if diff -b "$tb.bintree" "$pb.bintree.strip.srl" > "test$HEADMARK.result"; then
			echo "Good. testtball_$HEADMARK.bintree and testpball_$HEADMARK.bintree.strip.srl are the same";
		else
			echo "Not good. testtball_$HEADMARK.bintree and testpball_$HEADMARK.bintree.strip.srl are not the same.";
			echo "Check diff file test$HEADMARK.result for details";
		fi
	else
		echo "Check traces of the binarizer"
		# With -d the trace is taken from stderr; stdout is not needed.
		if ! perl "$binarizer" -d < "$tb" 2> "$tb.debug.bintree" > /dev/null; then
			err "perl $binarizer -d failed on $tb (see $tb.debug.bintree)"
			return 1
		fi
		if ! perl "$binarizer" -d < "$pb" 2> "$pb.debug.bintree" > /dev/null; then
			err "perl $binarizer -d failed on $pb (see $pb.debug.bintree)"
			return 1
		fi
		sed 's/:[^)>}: ]*\([)>}]\)/\1/g' "$pb.debug.bintree" > "$pb.debug.bintree.strip.srl"
		# Note: this diff uses -bw to ignore all white space, so it may not
		# be fully accurate. Always inspect the result file as well.
		diff -bw "$tb.debug.bintree" "$pb.debug.bintree.strip.srl" > "test$HEADMARK.debug.result"
		# Any diff line containing "(" is printed and counts as a real
		# difference. Only grep's "no match" status (1) means success; a
		# grep error (2) must not be reported as a match.
		local grep_status=0
		grep "(" "test$HEADMARK.debug.result" || grep_status=$?
		if [[ "$grep_status" -eq 1 ]]; then
			echo "Good. testtball_$HEADMARK.debug.bintree and testpball_$HEADMARK.debug.bintree.strip.srl are the same";
		else
			echo "Not good. testtball_$HEADMARK.debug.bintree and testpball_$HEADMARK.debug.bintree.strip.srl are not the same.";
			echo "Check diff file test$HEADMARK.debug.result for details";
		fi
	fi
	return 0
}

main "$@"

