#!/bin/csh -f

# Usage: <this script> treebank-file
#
# Cleans the treebank file named by the first argument IN PLACE (BOTH SWBD and
# FISHER).  Only the first argument is read: whether EDITED portions are removed
# prior to training is decided by the editp switch below, not by a command-line
# flag.
if ($#argv < 1) then
    # Without a file name every "perl -pi" further down would read stdin.
    echo "Usage: $0 treebank-file" > /dev/stderr
    exit 1
endif
set input="$1"
if (! -e "$input") then
    echo "Input file not found - $input" > /dev/stderr
    exit 1
endif

# Switches for the optional passes (a value greater than 0 enables the pass):
#   editp  - remove EDITED constituents
#   xxp    - remove XX constituents
#   typop  - remove TYPO brackets
#   emptyp - remove empty nodes
#   ip     - run the ip-fix pass
set editp=0
set xxp=1
set typop=1
set emptyp=1
set ip=1

# create a variable to keep track of the directory in which the tool is sitting (in order to invoke other scripts that are there).
#set here=/export/ws05pssed/APPS/clean-tool #`dirname $0`
set here=`dirname $0`

# Temp name that supports parallel processing: the shell's process id ($$)
# keeps concurrent runs in the same working directory from overwriting each
# other's intermediate output.
#set temp=(`mktemp temp.XXXXXX`)
set temp=_tmp.$$

# ---------------------------------------------------------------------------
# Stage 1: line-level rewrites done before the parses are joined.
# The edits are made directly on $input.  Rules, in the order they are applied:
#   1. blank out comment lines, i.e. lines that start with "*x" (SWBD)
#   2. delete punctuation brackets, properly and improperly signalled: a
#      one-character label (capital letter , . :) followed by a run of
#      , ? . ! -   (BOTH SWBD and FISHER)
#   3. delete brackets whose word ends in a dash (illegal cue to disfluency);
#      the empty categories this leaves behind are removed in stage 2
# ---------------------------------------------------------------------------
perl -pi \
    -e 's/^\*x.*$//g;' \
    -e 's/\([A-Z\,\.\:] [\,\?\.\!\-]+\)//g;' \
    -e 's/\([^\)\(]+ +[^\(\) ]+-\)//g;' \
    $input

# Put one parse per line to make pattern matching easier and get rid of extra newlines
$here/one-line $input

# ---------------------------------------------------------------------------
# Stage 2: rewrites on the one-parse-per-line form.  Rules, in order:
#   1. delete brackets that hold a label but no content; repeated until nothing
#      changes, because deleting an inner bracket can leave its parent empty
#   2. blank out code lines, i.e. lines that start with "(S1 (CODE" (SWBD)
#   3. keep the first of A|B (SWBD)
#   4. keep the first of A^B (SWBD)
#   5. remove remaining ^ marks (SWBD)
#   6. downcase terminal words that start with a capital letter and sit under
#      an all-capitals tag, optionally ending in $ (SWBD)
# ---------------------------------------------------------------------------
perl -pi \
    -e 'while(s/\([^ \)\(]*\s*\)//g){}' \
    -e 's/^\(S1 \(CODE.*$//g;' \
    -e 's/(\([^\|\(\)]+)\|\S+/$1/g;' \
    -e 's/(\([^\|\(\)]+)\^\S+/$1/g;' \
    -e 's/\^//g;' \
    -e 's/\(([A-Z]+[\$]?)\s+([A-Z][\w\-]*)\)/($1 \L$2\E)/g;' \
    $input

# ---------------------------------------------------------------------------
# Tsurgeon passes.
# Every pass reads the trees from $input, applies one or more operation files
# from $here/tsurgeon-files and writes the result to $temp.  $temp replaces
# $input only when java exited with status 0; otherwise the script stops, so a
# failed run can no longer overwrite the input with an empty or partial file.
# NOTE(review): stanford-tools.jar is given as a relative classpath, so it is
# looked up in the current working directory, not in $here - confirm intended.
# ---------------------------------------------------------------------------

# remove disfluency and IP nodes.
# The operation file gets its own variable (ipfix) so the ip switch keeps its value.
if ($ip > 0) then
    set ipfix="$here/tsurgeon-files/ip-fix"
    java -cp stanford-tools.jar -mx500m edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon -treeFile $input $ipfix > $temp
    if ($status != 0) then
        echo "Tsurgeon ip-fix pass failed; $input was not replaced" > /dev/stderr
        rm -f $temp
        exit 1
    endif
    mv $temp $input
endif

# remove RM, RS, and -DFL- nodes.  This pass is unconditional.
set dis="$here/tsurgeon-files/dis-fix"
set dis2="$here/tsurgeon-files/dis2-fix"
java -cp stanford-tools.jar -mx500m edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon -treeFile $input $dis $dis2 > $temp
if ($status != 0) then
    echo "Tsurgeon dis-fix pass failed; $input was not replaced" > /dev/stderr
    rm -f $temp
    exit 1
endif
mv $temp $input


# remove TYPO and promote its arguments (SWBD and FISHER)
# To delete TYPO's set $typop to 1.
if ($typop > 0) then
    set typo="$here/tsurgeon-files/typo-fix"
    java -cp stanford-tools.jar -mx500m edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon -treeFile $input $typo > $temp
    if ($status != 0) then
        echo "Tsurgeon typo-fix pass failed; $input was not replaced" > /dev/stderr
        rm -f $temp
        exit 1
    endif
    mv $temp $input
endif

# Remove trace nodes
if ($emptyp > 0) then
    set none="$here/tsurgeon-files/none-fix"
    java -cp stanford-tools.jar -mx500m edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon -treeFile $input $none > $temp
    if ($status != 0) then
        echo "Tsurgeon none-fix pass failed; $input was not replaced" > /dev/stderr
        rm -f $temp
        exit 1
    endif
    mv $temp $input
endif

# clean edits if you'd like; to do so set $editp to a number greater than 0 (BOTH SWBD and FISHER)
if ($editp > 0) then
    set edit="$here/tsurgeon-files/edit-fix"
    java -cp stanford-tools.jar -mx500m edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon -treeFile $input $edit > $temp
    if ($status != 0) then
        echo "Tsurgeon edit-fix pass failed; $input was not replaced" > /dev/stderr
        rm -f $temp
        exit 1
    endif
    mv $temp $input
endif

# delete partial words and MUMBLEx; XX is a preterminal
# To delete XX's set $xxp to 1.
# Note that these are the only partial words that I delete for training.
# Their class is unknown; hence, the odd preterminal.  I have chosen to keep
# the other partial words and their constituents because the annotators could
# identify the word class.  To delete those partial words would, I believe, be
# harmful to the grammar extracted.  (SWBD and FISHER)
if ($xxp > 0) then
    set xx="$here/tsurgeon-files/xx-fix"
    java -cp stanford-tools.jar -mx500m edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon -treeFile $input $xx > $temp
    if ($status != 0) then
        echo "Tsurgeon xx-fix pass failed; $input was not replaced" > /dev/stderr
        rm -f $temp
        exit 1
    endif
    mv $temp $input
endif

# Tsurgeon signals a tree that ended up empty with a line reading exactly
# "null".  Blank every such line in place and report each one on stderr.
perl -pi -e 'if (s/^null$//g) { print STDERR "Empty line in file\n" }' $input

# Run the one-parse-per-line helper once more, then append a single empty
# line to the end of the file.
$here/one-line $input
echo "" >> $input


