###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################


# cat wsj_0001.trees | perl scripts/treesed.pl
use Getopt::Std;

# Command-line switches (both take no argument):
#   -p  remove punctuation constituents from every tree
#   -d  print a trace of the rewriting steps via debug()
getopts("pd");

# Turn the switch variables filled in by getopts into plain 0/1 flags.
$remove_punct = $opt_p ? 1 : 0;
$DEBUG        = $opt_d ? 1 : 0;

# Print a two-part trace message to STDERR when debugging is enabled.
#
# Arguments:
#   $_[0]  label or step counter, printed first
#   $_[1]  message text, printed after a single blank
#
# Does nothing unless the global $DEBUG is true.
sub debug {
  return unless $DEBUG;
  my ($label, $msg) = @_;
  # Write to the canonical STDERR handle: the lowercase spelling is a legacy
  # alias that only resolves in package main and does not follow a
  # redirected STDERR.
  print STDERR $label, " ", $msg, "\n";
}

# Pattern for an optional SRL annotation inside a child: the shifted colon
# marker followed by everything up to the end of that child.
$SRL = "!colon![^>]*";

# Punctuation context to the left / right of a constituent.  These patterns
# are referenced only by rewrite rules that are currently commented out; they
# do not depend on the input, so they are set once here rather than on every
# pass of the constituent loop.
$LEXTPU="[,`'\\!\\.\\?^][ \\)>\\]]*[^\\)>\\]]*|^[^\\)>\\]]*";
#$LEXTP="^[^\\)>\\]]*";
$REXTPU="[ \\)>\\]]*[^\\)>\\]]*[,`'\\!\\.\\?]";

## Main loop: every input line (from the files on the command line, or from
## standard input) is treated as one bracketed tree.
##
## Processing per line:
##   1. line-level clean-up: parenthesis spacing, repair annotation, optional
##      punctuation removal (when $remove_punct is 1), symbol shifting and
##      stripping of dash tags;
##   2. innermost-first rewriting: repeatedly take the first parenthesised
##      group that contains no further parentheses, mark it with braces {...},
##      apply the rules below to it, then re-mark it with angles <...> so that
##      it appears as a finished child of the next group; children of finished
##      children are marked with square brackets [...];
##   3. the first '<' and first '>' and all square brackets are turned back
##      into parentheses and the line is printed to standard output.
## for each tree...
$lineNum=1;
while ( <> ) {
  debug("***TB*** line ", $lineNum++);

  ## Make parentheses look standard...
  s/\( */(/g;
  s/ *\)/)/g;

  ## Remove repair annotation
  ## The normalisation above has already deleted every blank in front of ')',
  ## so the blank before the final ')' of RM / IP / RS has to be optional;
  ## with a mandatory blank these three rules could never match and the
  ## escaped bracket tokens would later be turned into structural parentheses.
  s/\(RM \(\-DFL\- \\\[\) *\)//g;
  s/\(IP \(\-DFL\- \\\+\) *\)//g;
  s/\(-DFL- E_S\)//g;
  s/\(-DFL- N_S\)//g;
  s/\(RS \(\-DFL\- \\\]\) *\)//g;
  ## Sometimes that creates empty categories (especially in mis-annotated data)
  ## Remove the first such empty category here; any others are deleted by the
  ## "unary projections of deleted empty constituents" rule in the loop below.
  s/\([^ ]+[ ]+\)//;

  ## Collapse out everything between cat. and UNF
  s/\(([^ ]+)-[^ ]+-UNF/(\1-UNF/g;
  s/-UNF/UNF/g;
  # We will undo this shift later, but for right now it's done so INTJ is not
  # confused with IN (I think 4 underscores should be enough)
  s/INTJ/____INTJ/g;

  ## translate to parens...
  ## (square brackets are needed below as internal marks)
  s/\[/\(/g;
  s/\]/\)/g;

  if($remove_punct == 1){
      # Remove/change punctuation...
      s/\([^ ]+ \.\.\.\)//g;
      s/\([^ ]+ \.\)//g;
      s/\([^ ]+ \!\)//g;
      s/\([^ ]+ \?\)//g;
      s/\([^ ]+ \,\)//g;
      s/\([^ ]+ *\([^ ]+ *\-\-\) *\)//g;  ## dash is nested... don't know why
      s/\([^ ]+ \-\-\)//g;
      s/\([^ ]+ \-\)//g;
      s/\([^ ]+ \;\)//g;
      s/\([^ ]+ \:\)//g;
      s/\([^ ]+ \`\)//g;
      s/\([^ ]+ \'\)//g;
      s/\([^ ]+ \`\`\)//g;
      s/\([^ ]+ \'\'\)//g;
      s/\([^ ]+ -L.B-\)//g;
      s/\([^ ]+ -R.B-\)//g;
  }

  # Get rid of typos (categories starting with ^)
  s/\(\^/\(/g;

  # Distinguish punctuation...
  # (the category of each punctuation terminal is replaced by the punctuation
  # mark itself; an optional ":..." suffix on the word is kept)
  s/\([^ ]+ \.\.\.(:[^\)]*)?\)/\(\.\.\. \.\.\.\1\)/g;
  s/\([^ ]+ \?(:[^\)]*)?\)/\(\? \?\1\)/g;
  s/\([^ ]+ *\([^ ]+ *\-\-(:[^\)]*)?\) *\)/\(\-\- \-\-\1\)/g;  ## dash is nested... don't know why
  s/\([^ ]+ \-\-(:[^\)]*)?\)/\(\-\- \-\-\1\)/g;
  s/\([^ ]+ \-(:[^\)]*)?\)/\(\-\- \-\-\1\)/g;
  s/\([^ ]+ \:(:[^\)]*)?\)/\(\: \:\1\)/g;
  s/\([^ ]+ \;(:[^\)]*)?\)/\(\; \;\1\)/g;
  s/\([^ ]+ \`(:[^\)]*)?\)/\(\` \`\1\)/g;
  s/\([^ ]+ \'(:[^\)]*)?\)/\(\' \'\1\)/g;

  ########## GLOBAL SYMBOL SHIFT
  # Replace special characters by !name! tokens; the commented-out lines are
  # shifts that are currently disabled.
  #s/\!/\!exclamation\!/g;
  s/\~/\!tilde\!/g;
  #s/\`\`/\!openquote\!/g;
  #s/\`/\!openscare\!/g;
  s/\@/\!at\!/g;
  s/\#/\!pound\!/g;
  #s/\$/\!dollar\!/g;
  #s/\%/\!percent\!/g;
  #s/\^/\!carat\!/g;
  s/\&/\!ampersand\!/g;
  #s/\*/\!star\!/g;
  s/\-\-/\!dash\!/g;
  s/\+/\!plus\!/g;
  #s/\=/\!equals\!/g;
  s/\:/\!colon\!/g;
  s/\;/\!semi\!/g;
  #s/\"/\!dblquote\!/g;
  #s/\'\'/\!closequote\!/g;
  #s/\'/\!closescare\!/g;
  #s/\./\!period\!/g;
  #s/\,/\!comma\!/g;
  #s/([^\*])\?/\1\!question\!/g;
  s/\//\!slash\!/g;
  ##########

  # substitute -LRB- to \!LRB\!
  s/-L.B-/\!LRB\!/g;
  # substitute -RRB- to \!RRB\!
  s/-R.B-/\!RRB\!/g;
  # substitute -NONE- to \*NONE\*
  s/-NONE-/\*NONE\*/g;
#  # substitute *T* to \!t\! ("word" trace will get lowercased anyway)
#  s/\*T\*/\!t\!/g;
#  # substitute *RNR* to \!rnr\! ("word" trace will get lowercased anyway)
#  s/\*RNR\*/\!rnr\!/g;
#  # substitute NP-TMP to NP-t
#  s/NP[^ ]*\-TMP/NP-t/g;
  # substitute S*-ADV to S*-adv
  s/(S[^ ]*)\-ADV/\1-adv/g;
  # remove all other dash specifications beginning with capital letter
  # (each pass removes one such tag per label, so the rule is applied three
  # times to handle labels carrying several tags)
  s/\-[A-Z][A-Z]*([^ \)\]]*) /\1 /g;
  s/\-[A-Z][A-Z]*([^ \)\]]*) /\1 /g;
  s/\-[A-Z][A-Z]*([^ \)\]]*) /\1 /g;


  ## for each constituent...
  ## (i.e. for each parenthesised group without nested parentheses; $step
  ## counts the groups processed on this line for the debug trace)
  $step = 0;
  while ( $_ =~ /\([^\(\)]*\)/ ) {
    ## convert outer parens to braces...
    $_ =~ s/\(([^\(\)]*)\)/{\1}/;
    #################### ADD SED RULES HERE: apply rules to angles (children) within braces (constituent)...
    #print stderr "   $_\n";
    debug(++$step, "   $_");


    #### EMPTY CATEGORIES
    # delete ICH (moved -- not just raised -- modifier) and U (transposed currency symbol) traces; too hard to reconstruct
    s/{\*NONE\* .*ICH.*}//;
    s/{\*NONE\* *\*U\*(!colon!.*)? *}//;
    # eliminate expletive it cleft (a shame, but sent is too deeply attached)
    s/{NP[^ ]* +<(NP[^>]*)> +<S[^ ]*-empty 0($SRL)?> *}/{\1}/;
    # delete unary projections of deleted empty constituents
    s/{[^ ]* *}//;
    # fold empty category for currency into nonterm
    s/{QP([^ ]*) +(.*<\!pound\!.*)}/{NP-currunit\1 \2}/;
    s/{QP([^ ]*) +(.*<\$.*)}/{NP-currunit\1 \2}/;
    s/{QP([^ ]*) +(.*<\%.*)}/{NP-currunit\1 \2}/;
    s/{(NP[^ ]*|ADJP[^ ]*) (.*)<NP-currunit[^ ]* ([^>]*)> *<\*NONE\* [^>]*>([^}]*)}/{\1 \2 \3\4}/;   ## probably redundant now with *U* removed
    # fold empty X categories into X-empty nonterm
    # (trace words are matched in lower case because terminals have already
    # been lower-cased when they were processed as constituents)
    s/{([^ ]*) +<\*NONE\* +\*t\*\-([0-9]+)($SRL)?> *}/{\1-empty-\2 0\3}/;
    s/{([^ ]*) +<\*NONE\* +\*rnr\*\-([0-9]+)($SRL)?> *}/{\1-empty-\2 0\3}/;
    s/{([^ ]*) +<\*NONE\* +\*($SRL)?> *}/{\1-empty-ctrl 0\2}/;
    if ( $_ =~ /\!colon\!(REL|rel)/ ) { #use this to check if input is srl tree
      s/{([^ ]*) +<\*NONE\* +[^>]*($SRL)> *}/{\1-empty 0\2}/;
    } else {
      s/{([^ ]*) +<\*NONE\* +[^>]*> *}/{\1-empty 0}/;
    }

#    # rename VP with TO to infinitive VP-inf nonterm
#    s/{VP +(<TO[^>]*> *<VP[^>]*>[^ ]*)}/{VPto-inf \1}/;
#    # fold NP trace in S trace into infinitive VP-inf nonterm
#    s/{S +<NP-empty 0> *<VPto ([^>]*)> *}/{VPto \1}/;
    # fold NP trace in VP into passive VP nonterm
    s/{(VP[^ ]*|VBN[^ ]*|VBD[^ ]*) +(.*)<(VBN[^ ]*|VBD[^ ]*) +([^>]*)> *<NP[^ ]*-empty(-ctrl)? 0($SRL)?>(.*)}/{\1 \2<\3-v-bNP \4> \7}/;
    # fold empty NP in S with VPvb|VPvbp into S imperative
    s/{(S[^ ]*|PRN[^ ]*) +(.*)<[^ ]*NP[^ ]*-empty-ctrl 0($SRL)?> *<(VPvbp|VPvb|VBPvbp|VBvb)([^a-z][^>]*)>(.*)}/{\1 \2<Simp\5>\6}/;
    # fold empty NP in S with VPvbg into S pro progressive / passive / infinitive
    s/{(S[^ ]*) +(.*)<[^ ]*NP[^ ]*-empty(-ctrl)? 0($SRL)?> *<(VP|VB[A-Z]*|TO)(vbg|vbn|prd|vbd|vb|to)([^>]*)>(.*)}/{\1 \2<Spro\6\7>\8}/;
    # fold empty NP in S followed by ADJP|NP|PP into S pro stative (Sproprd)
    s/{(S[^ ]*) +(.*)<[^ ]*NP[^ ]*-empty(-ctrl)? 0($SRL)?> *(.*)<(ADJP|NP|PP)([^>]*)>(.*)}/{\1 \2\5<Sproprd\7>\8}/;
#    # fold NP trace in S with VP into VP nonterm
#    s/{S[^ ]* +<[^ ]*-empty 0> *<VP([^>]*)> *}/{VP\1}/;
#    # fold NP trace in S with VP into VP nonterm, subsuming args
#    s/{S[^ ]* +(.*)<[^ ]*-empty 0> *<(VP[^ ]*) ([^>]*)>(.*)}/{VP\2 \1<VP\2 \3>\4}/;
#    # fold NP trace in S with XP into XP nonterm
#    s/{S[^ ]* +<[^ ]*-empty 0> *<(ADJP[^>]*|NP[^>]*|PP[^>]*)> *}/{\1}/;
    # fold NP trace in SBAR into S nonterm
    s/{SBAR[^ \^]*([^ ]*) +<\*NONE\* [^>]*> *<S([^ \^]*)[^ \-]*([^ ]*) ([^>]*)> *}/{S\2\1\3 \4}/;
    # NOTE(review): here group 2 is the trace index and group 4 the remainder
    # of the S label, so the replacement puts the index directly after "S" and
    # the label remainder after the dash -- confirm this ordering is intended.
    s/{SBAR[^ \^]*([^ ]*) +<[^ ]*-empty-([0-9]*) 0($SRL)?> *<S([^ \^]*)[^ \-]*([^ ]*) ([^>]*)> *}/{S\2\1\5-\4 \6}/;
#    # redo some mislabeled rel clauses with subject extr
#    s/{(SBAR[^ ]*) +(<WH.*) +<(Simp|Spro[a-z]*)(.*)}/{WH\1 \2 <S-extrNP\4}/;
    # redo SBAR as WHSBAR
    s/{(SBAR[^ ]*) +(<WH.*) +(<S.*)}/{WH\1 \2 \3}/;
#    # last resort: nuke all remaining empty constituents w/o trace
#    s/{[^ ]* +<[A-Z]*-empty(-ctrl)? +[^>]*> *<([^>]*)> *}/{\2}/;
#    s/{[^ ]* +<([^>]*)> +<[A-Z]*-empty(-ctrl)? +[^>]*> *}/{\1}/;

    # turn NN in VP into VBG -- weird way to annotate gerunds
    s/{(VP[^ ]*) <NN[^ \^]*([^ ]*) ([^>]*)>(.*)}/{\1 <VBGvbg\2 \3>\4}/;

    #### TERMINALS
#    s/{(NN(?!P)S?) +([^ ]*)}/{\1nn \2}/;
    # give terminal symbols lower case and add lowercase pos to category
    if ($_ =~ /{(VB[A-Z]*|MD|TO|BES|HVS)([^ ]*) *([^ \/]*) *}/) { $c=$1; $u=$2; $p=lc($1); $w=lc($3); s/{.*}/{$c$p$u $w}/; }
#    if ($_ =~ /{(VB[A-Z]*|MD|TO|BES|HVS)([^ ]*) *([^ \/]*) *}/) { $c=$1; $u=$2; $p=lc($1); $w=$3; s/{.*}/{$c$p$u $w}/; } ## To use case in POS decisions
    # give terminal symbols lower case
    if ($_ =~ /{([A-Z\$]*)([^ ]*) *([^ \/]*) *}/) { $c="$1$2"; $w=lc($3); s/{.*}/{$c $w}/; }
#    if ($_ =~ /{([A-Z\$]*)([^ ]*) *([^ \/]*) *}/) { $c="$1$2"; $w=$3; s/{.*}/{$c $w}/; } ## To use case in POS decisions (with above)




#     #### NOUN PHRASES
#     # right-binarize (basal) NPs as much as possible
#     s/{(NP|WHNP)[a-z]*([^ ]*) +(.*)<(?![-,`'\!])(?!`)([A-Z]*)([^ ]* [^>]*)> *<(NN[A-Z]*)([a-z]*)([^ ]* [^>]*)>(.*)}/\(\1\2 \3\{\6 <\4\5> <\6\7\8>\}\9\)/;
     # left-binarize NP genitive marker if left and right context reduced to nil
     s/{(NP|WHNP)[a-z]*([^ ]*) +( *)<(NN[A-Z]*|NP)([a-z]*)([^ ]* [^>]*)> *<(POS)([^ ]* [^>]*)>( *)}/\{\1pos <\4\5\6> <\7\8>\}/;
#     # grab NN|NP + comma + mod + internal comma as NP
#     s/{(NP)[a-z]*([^ ]*) +<(NN[A-Z]*|NP)n?n?([a-z]*)([^ ]* [^>]*)> *<, ,> *<((?=ADVP|PP|S|VP|WHSBAR|ADJP|NP)(?![^ ]*-rc)[^ ]*)([^>]*)>( *<, ,>.*)}/\(\1\2 \{\3 <\3nn\4\5> <\6-rc-mc [, ,] [\6\7]>\}\8\)/;
#     # grab NN|NP + comma + mod + external punct as NP
#     s/{(NP)[a-z]*([^ ]*) +<(NN[A-Z]*|NP)n?n?([a-z]*)([^ ]* [^>]*)> *<, ,> *<((?=ADVP|PP|S|VP|WHSBAR|ADJP|NP)(?![^ ]*-rc)[^ ]*)([^>]*)> *}($REXTPU)/\(\1\2 \{\3 <\3nn\4\5> <\6-rc-mc [, ,] [\6\7]>\}\)\8/;
# #    # grab NN|NP at left + comma + mod + comma
# #    s/{(NP)[a-z]*([^ ]*) +( *)<(NN[A-Z]*|NP)n?n?([a-z]*)([^ ]* [^>]*)> *<(ADVP|PP|S|VP|WHSBAR|ADJP|NP)([^ ]* [^>]*)>(.*)}/\(\1\2 \3\{\4 <\4nn\5\6> <\7\8>\}\9\)/;  ##>( *<.*)}
# #    # grab NN|NP at left + comma + mod + external comma
# #    s/{(NP)[a-z]*([^ ]*) +( *)<(NN[A-Z]*|NP)n?n?([a-z]*)([^ ]* [^>]*)> *<(ADVP|PP|S|VP|WHSBAR|ADJP|NP)([^ ]* [^>]*)>(.*)}/\(\1\2 \3\{\4 <\4nn\5\6> <\7\8>\}\9\)/;  ##>( *<.*)}
#     # grab mod to right of NN or NP, if left context reduced to nil
#     s/{(NP)[a-z]*([^ ]*) +( *)<(NN[A-Z]*|NP)n?n?([a-z]*)([^ ]* [^>]*)> *<(ADVP|PP|S|VP|WHSBAR|ADJP|NP)([^ ]* [^>]*)>(.*)}/\(\1\2 \3\{\4 <\4nn\5\6> <\7\8>\}\9\)/;  ##>( *<.*)}
# #    # NN projections ignore pos info
# #    s/{(NN)[A-Z]*([^ ]*) +(<.*)}/{\1\2 \3}/;
# #    # left-binarize NPs after left context reduced to nil
# #    s/{(NP)[a-z]*([^ ]*) +( *)<(NN[A-Z]*|NP)([a-z]*)([^ ]* [^>]*)> *<\, *\,> *<(PP|S|VP|WHSBAR)([^ ]* [^>]*)>(.*)}/\(\1\2 \3\{\4 <\4\5\6> <\, \,> <\7\8>\}\9\)/;  ##>( *<.*)}
#     # undo last unary N bar projection
#     s/\((NP)[a-z]*([^ ]*) +\{(NP|NN[A-Z]*)([a-z]*)([^ ]*) +(.*)\} *\)/{\1\2 \6}/;  ## +(\[

#    #### TIME NPS
#    # substitute NP-tmp
#    s/{NP[^ ]*-t[^ \-\=]*[^ ]* (.*)}/\(PP-t \{NP \1\}\)/g;
#    # remove all other dash specifications beginning with capital letter or number, not ending in dash
#    s/{([^ \-\=]*)[\-\=][\-\=A-Z0-9]*[^\-] /{\1 /g;

#     #### VERB PHRASES
#     # grab internal comma + S-adv|ADVP|RB|PP + comma + VB|VP as VP
#     s/{(VP|VB[A-Z]*)[a-z]*([^ ]*)(.*<, ,>) *<((?=S-adv|ADVP|RB|PP)[^ ]*)([^>]*)> *<, ,> *<(VB[A-Z]*|VP)([a-z]*)([^ ]*)([^>]*)>( *| *<[^A-Z].*)}/\(\1\7\2\3 \{\6\7\8 <\4-lc-mc [\4\5] [, ,]> <\6\7\8\9>\}$10\)/;
#     # grab external punct + S-adv|ADVP|RB|PP + comma + VB|VP as VP
#     s/($LEXTPU){(VP|VB[A-Z]*)[a-z]*([^ ]*) *<((?=S-adv|ADVP|RB|PP)[^ ]*)([^>]*)> *<, ,> *<(VB[A-Z]*|VP)([a-z]*)([^ ]*)([^>]*)>( *| *<[^A-Z].*)}/\1\(\2\7\3 \{\6\7\8 <\4-lc-mc [\4\5] [, ,]> <\6\7\8\9>\}$10\)/;
#     # right-binarize VPs after right context reduced to nil or punct constituent
#     s/{(VP|VB[A-Z]*)[a-z]*([^ ]*) +(.*)<(S-adv|ADVP|RB|PP)([^ ]* [^>]*)> *<(VB[A-Z]*|VP)([a-z]*)([^ ]* [^>]*)>( *| *<[^A-Z].*)}/\(\1\7\2 \3\{\6\7 <\4\5> <\6\7\8>\}\9\)/; ##(<.*)<
# #    # right-binarize VPs after right context does not contain normal constituents (i.e. contains punct like quote)
# #    s/{(VP|VB[A-Z]*)[a-z]*([^ ]*) +(.*)<(S-adv|ADVP|RB|PP)([^ ]* [^>]*)> *<(VB[A-Z]*|VP)([a-z]*)([^ ]* [^>]*)>( *<[^A-Z].*)}/   \(\1\7\2 \3\{\6\7 <\4\5> <\6\7\8>\}\9\)/; ##(<.*)<
#     # left-binarize VPs headed by VB or BES as much as possible
#     s/{(VP|VB[A-Z]*|SQ)[a-z]*([^ ]*) +(.*)<(VB[A-Z]*|BES)([a-z]*)([^ ]* [^>]*)> *<(?!CC)([A-Z]+)([^ ]* [^>]*)>(.*)}/\(\1\5\2 \3\{\4\5 <\4\5\6> <\7\8>\}\9\)/;
#     # left-binarize VPs headed by TO or MD as much as possible
#     s/{(VP|VB[A-Z]*|SQ)[a-z]*([^ ]*) +(.*)<(TO|MD)([a-z]*)([^ ]* [^>]*)> *<(VP|VB)([^ ]* [^>]*)>(.*)}/\(\1\5\2 \3\{VP\5 <\4\5\6> <\7\8>\}\9\)/;
#     # grab VP|VB + comma + mod + internal comma as VP
#     s/{(VP|VB[A-Z]*|SQ)[a-z]*([^ ]*) +(.*)<(VP[a-z]*)([^ ]*)([^>]*)> *<, ,> *<((?=PP|VP)[^ ]*)([^>]*)>( *<, ,>.*)}/\(\1\5\2 \3\{\4 <\4\5\6> <\7-rc-mc [, ,] [\7\8]>\}\9\)/;
#     # grab VP|VB + comma + mod + external punct as VP
#     s/{(VP|VB[A-Z]*|SQ)[a-z]*([^ ]*) +(.*)<(VP[a-z]*)([^ ]*)([^>]*)> *<, ,> *<((?=PP|VP)[^ ]*)([^>]*)> *}($REXTPU)/\(\1\5\2 \3\{\4 <\4\5\6> <\7-rc-mc [, ,] [\7\8]>\}\)\9/;
# debug($step, " \\ $_");
#     # left-binarize VPs headed by VP as much as possible
#     s/{(VP|VB[A-Z]*|SQ)[a-z]*([^ ]*) +(.*)<(VP)([a-z]*)([^ ]* [^>]*)> *<(PP|VP)([^ ]* [^>]*)>(.*)}/\(\1\5\2 \3\{\4\5 <\4\5\6> <\7\8>\}\9\)/;
# #    # VB projections clump pos info
# #    s/{(VBG|VBN)(vbg|vbn)([^ ]*) +(<.*)}/{VBprd\3 \4}/;
# #    s/{(VBZ|VBD)(vbz|vbd)([^ ]*) +(<.*)}/{VBtns\3 \4}/;
#     # undo last unary V bar projection
#     s/\((VP|VB[A-Z]*)[a-z]*([^ ]*) +\{(VP|VB[A-Z]*)([a-z]*)([^ ]*) +(.*)\} *\)/{\1\4\2 \6}/;  ## +(\[

#     #### SENTENTIAL PROJECTIONS
     # right-binarize NP VP as S
     s/{(S[A-Z]*)[a-z]*([^ ]*) +(.*)<(NP)([^ ]* [^>]*)> *<(VP)([a-z]*)([^ ]* [^>]*)>(.*)}/\(\1\2 \3\{S-h <\4\5> <\6\7\8>\}\9\)/;
#     # under SINV: right-binarize MOD VP as VP
#     s/{(SINV)[a-z]*([^ ]*) +(.*)<(S-adv|ADVP|RB[A-Z]*|PP)([^ ]* [^>]*)> *<(VP|VB[A-Z]*)([a-z]*)([^>]*)>(.*)}/\(\1\2 \3\{\6\7 <\4\5> <\6\7\8>\}\9\)/;
#     # under S: grab internal comma + S-adv|ADVP|RB|PP + comma + VB|VP as VP
#     s/{(S[A-Z]*)[a-z]*([^ ]*)(.*<, ,>) *<((?=S-adv|ADVP|RB|PP)[^ ]*)([^>]*)> *<, ,> *<(VB[A-Z]*|VP)([a-z]*)([^ ]*)([^>]*)>(.*)}/\(\1\7\2\3 \{\6\7\8 <\4-lc-mc [\4\5] [, ,]> <\6\7\8\9>\}$10\)/;
#     # under S: grab external punct + S-adv|ADVP|RB|PP + comma + VB|VP as VP
#     s/($LEXTPU){(S[A-Z]*)[a-z]*([^ ]*) *<((?=S-adv|ADVP|RB|PP)[^ ]*)([^>]*)> *<, ,> *<(VB[A-Z]*|VP)([a-z]*)([^ ]*)([^>]*)>(.*)}/\1\(\2\7\3 \{\6\7\8 <\4-lc-mc [\4\5] [, ,]> <\6\7\8\9>\}$10\)/;
#     # under S: right-binarize MOD VP as VP
#     s/{(S[A-Z]*)[a-z]*([^ ]*) +(<.*)<(S-adv|ADVP|RB[A-Z]*|PP)([^ ]* [^>]*)> *<(VP|VB[A-Z]*)([a-z]*)([^>]*)>(.*)}/\(\1\2 \3\{\6\7 <\4\5> <\6\7\8>\}\9\)/;
#     # under S: grab internal comma + S-adv|ADVP|RB|PP + comma + S as S
#     s/{(S[A-Z]*)[a-z]*([^ ]*)(.*<, ,>) *<((?=S-adv|ADVP|RB|PP)[^ ]*)([^>]*)> *<, ,> *<(\1)([a-z]*)([^ ]*)([^>]*)>(.*)}/\(\1\7\2\3 \{\6\7\8 <\4-lc-mc [\4\5] [, ,]> <\6\7\8\9>\}$10\)/;
#     # under S: grab external punct + S-adv|ADVP|RB|PP + comma + S as S
#     s/($LEXTPU){(S[A-Z]*)[a-z]*([^ ]*) *<((?=S-adv|ADVP|RB|PP)[^ ]*)([^>]*)> *<, ,> *<(\2)([a-z]*)([^ ]*)([^>]*)>(.*)}/\1\(\2\7\3 \{\6\7\8 <\4-lc-mc [\4\5] [, ,]> <\6\7\8\9>\}$10\)/;
#     # under S: right-binarize MOD + S as S
#     s/{(S[A-Z]*)[a-z]*([^ ]*) +(.*)<(S-adv|ADVP|RB[A-Z]*|PP)([^ ]* [^>]*)> *<(\1)([a-z]*)( [^>]*)>(.*)}/\(\1\7\2 \3\{\6\7 <\4\5> <\6\7\8>\}\9\)/;
#     # left-binarize S
#     s/{(S[A-Z]*)[a-z]*([^ ]*) +(.*)<(\1)([a-z]*)( [^>]*)> *<(ADVP|RB[A-Z]*|PP)([^ ]* [^>]*)>(.*)}/\(\1\5\2 \3\{\4\5 <\4\5\6> <\7\8>\}\9\)/;
# ##    # right-binarize S from above
# ##    s/{(S[^ ]*) +<(ADVP[^ ]*|PP[^ ]*) (.*>.*<.*)}/\(\1 \2 \{\1 \3\}\)/;
# ##    # left-binarize S from above
# ##    s/{(S[^ ]*) (.*>.*<.*) +<(ADVP[^ ]*|PP[^ ]*)}/\(\1 \{\1 \2\} \3\)/;
# #    # left-binarize VPs after left context reduced to nil
# #    s/{(VP[^ ]*|SQ[^ ]*) +( *)<(VB[^ ]*) ([^>]*)> *<([^ ]*) ([^>]*)> *(<.*)}/\(\1 \2\{\3 <\3 \4> <\5 \6>\}\7\)/;
#     # undo last unary S bar projection
#     s/\((S)[a-z]*([^ ]*) +\{(S)([a-z]*)([^ ]*) +(.*)\} *\)/{\1\4\2 \6}/;  ## +(\[
#     # redo Sto
#     s/{(S)(-.*)? +(<NP.*) +(<VPto.*)}/\(\1to\2 {\1to \3 \4}\)/;

#     #### ADJECTIVAL / ADVERBIAL PHRASES
#     # right-binarize ADJPs as much as possible
#     s/{(ADJP)[a-z]*([^ ]*) +(.*)<(RB[A-Z]*)([^ ]* [^>]*)> *<(JJ[A-Z]*)([a-z]*)([^ ]* [^>]*)>(.*)}/\(\1\2 \3\{\6 <\4\5> <\6\7\8>\}\9\)/;
#     # left-binarize ADJPs after left context reduced to nil
#     s/{(ADJP)[a-z]*([^ ]*) +( *)<(JJ[A-Z]*|ADJP)([a-z]*)([^ ]* [^>]*)> *<(PP|S)([^ ]* [^>]*)>(.*)}/\(\1\2 \3\{\4 <\4\5\6> <\7\8>\}\9\)/;  ##>( *<.*)}
#     # undo last unary A bar projection
#     s/\((ADJP)[a-z]*([^ ]*) +\{(JJ[A-Z]*)([a-z]*)([^ ]*) +(.*)\} *\)/{\1\2 \6}/;  ## +(\[
#     # right-binarize ADVPs as much as possible
#     s/{(ADVP)[a-z]*([^ ]*) +(.*)<(RB[A-Z]*)([^ ]* [^>]*)> *<(RB[A-Z]*)([a-z]*)([^ ]* [^>]*)>(.*)}/\(\1\2 \3\{\6 <\4\5> <\6\7\8>\}\9\)/;
#     # left-binarize ADVPs after left context reduced to nil
#     s/{(ADVP)[a-z]*([^ ]*) +( *)<(RB[A-Z]*|ADVP)([a-z]*)([^ ]* [^>]*)> *<(PP|S)([^ ]* [^>]*)>(.*)}/\(\1\2 \3\{\4 <\4\5\6> <\7\8>\}\9\)/;  ##>( *<.*)}
#     # undo last unary Ad bar projection
#     s/\((ADVP)[a-z]*([^ ]*) +\{(RB[A-Z]*)([a-z]*)([^ ]*) +(.*)\} *\)/{\1\2 \6}/;  ## +(\[
#     # annotate unary rb
#     s/{(ADVP[^ ]*) *<(RB[^ ]*) ([^ >]*)> *}/{\1 <\2-unary \3>}/;

#     #### PREPOSITIONAL PHRASES
# #    # annotate particles with word instead of pos
# #    s/{(PRT)[^ ]* *<(RP)[^ ]* ([a-z]*) *> *}/{\1\3 <\2\3 \3>}/;
     # annotate prepositions with word instead of pos
     s/{(IN)[a-z]*([^ ]*) *(of|that)(\!colon\!.*)?}/{\1\3\2 \3\4}/;
#     #s/{(PP|SBAR)[a-z]*([^ ]*) *<(IN|TO)(of|that|to)([^ ]* .*)}/{\1\4 <\3\4\5}/;
#     # left-binarize PPs/SBARs headed by IN or TO as much as possible
#     s/{(PP|SBAR)[a-z]*([^ ]*) +(.*)<(IN|TO)([a-z]*)([^ ]* [^>]*)> *<([A-Z]+)([^ ]* [^>]*)>(.*)}/\(\1\5\2 \3\{\1\5 <\4\5\6> <\7\8>\}\9\)/;
#     # right-binarize PPs after right context reduced to nil
#     s/{(PP)[a-z]*([^ ]*) +(.*)<(ADVP|RB|PP)([^ ]* [^>]*)> *<(PP)([a-z]*)([^ ]* [^>]*)>( *)}/\(\1\7\2 \3\{\6\7 <\4\5> <\6\7\8>\}\9\)/; ##(<.*)<
#     # undo last unary P bar projection
#     s/\((PP)[a-z]*([^ ]*) +\{(PP)([a-z]*)([^ ]*) +(.*)\} *\)/{\1\4\2 \6}/;  ## +(\[
# #    # right-binarize PPs after right context reduced to nil
# #    s/{(PP)[a-z]*([^ ]*) +(.*)<(ADVP|RB)([^ ]* [^>]*)> *<(IN)([a-z]*)([^ ]* [^>]*)>(.*)}/\(\1\7\2 \3\{\6\7 <\4\5> <\6\7\8>\}\9\)/; ##(<.*)<



    # trace: state of the line before the head-projection rules
    debug($step, " / $_");


    # NPs first project to left children (with external punct, internal punct, or no punct)
    s/^([ \)>\]]*[^\)>\]]*){(NP[^ ]*) *<([^ ]*)([^>]*)> *<, ,> *<([^- ]*)([^ ]*-h[^>]*)>(.*)}/\1\(\2 \{\5-h <\3-mc-lc [\3-h\4] [, ,]> <\5\6>\}\7\)/;
    s/([,;\.?!^][ \)>\]]*[^\)>\]]*){(NP[^ ]*) *<([^ ]*)([^>]*)> *<, ,> *<([^- ]*)([^ ]*-h[^>]*)>(.*)}/\1\(\2 \{\5-h <\3-mc-lc [\3-h\4] [, ,]> <\5\6>\}\7\)/;
    s/{(NP.*) *<, ,> *<([^ ]*)([^>]*)> *<, ,> *<([^- ]*)([^ ]*-h[^>]*)>(.*)}/\(\1 <, ,> \{\4-h <\2-mc-lc [\2-h\3] [, ,]> <\4\5>\}\6\)/;
    s/{(NP.*)<([^ ]*)([^>]*)> *<([^- ]*)([^ ]*-h[^>]*)>(.*)}/\(\1\{\4-h <\2\3> <\4\5>\}\6\)/;
    # then project to right children (with external punct, internal punct, or no punct)
    s/{(NP.*)<([^- ]*)([^ ]*-h[^>]*)> *<, ,> *<([^ ]*)([^>]*)> *}([ \)>\]]*[^\)>\]]*[,;\.?!])/\(\1\{\2-h <\2\3> <\4-mc-rc [, ,] [\4-h\5]>\}\)\6/;
    s/{(NP.*)<([^- ]*)([^ ]*-h[^>]*)> *<, ,> *<([^ ]*)([^>]*)> *<, ,>(.*)}/\(\1\{\2-h <\2\3> <\4-mc-rc [, ,] [\4-h\5]>\} <, ,>\6\)/;
    s/{(NP.*)<([^- ]*)([^ ]*-h[^>]*)> *<([^ ]*)([^>]*)>(.*)}/\(\1\{\2-h <\2\3> <\4\5>\}\6\)/;

    # everything else first project to right children (with external punct, internal punct, or no punct)
    s/{((?!NP).*)<([^- ]*)([^ ]*-h[^>]*)> *<, ,> *<([^ ]*)([^>]*)> *}([ \)>\]]*[^\)>\]]*[,;\.?!])/\(\1\{\2-h <\2\3> <\4-mc-rc [, ,] [\4-h\5]>\}\)\6/;
    s/{((?!NP).*)<([^- ]*)([^ ]*-h[^>]*)> *<, ,> *<([^ ]*)([^>]*)> *<, ,>(.*)}/\(\1\{\2-h <\2\3> <\4-mc-rc [, ,] [\4-h\5]>\} <, ,>\6\)/;
    s/{((?!NP).*)<([^- ]*)([^ ]*-h[^>]*)> *<([^ ]*)([^>]*)>(.*)}/\(\1\{\2-h <\2\3> <\4\5>\}\6\)/;
    # then project to left children (with external punct, internal punct, or no punct)
    s/^([ \)>\]]*[^\)>\]]*){((?!NP)[^ ]*) *<([^ ]*)([^>]*)> *<, ,> *<([^- ]*)([^ ]*-h[^>]*)>(.*)}/\1\(\2 \{\5-h <\3-mc-lc [\3-h\4] [, ,]> <\5\6>\}\7\)/;
    s/([,;\.?!^][ \)>\]]*[^\)>\]]*){((?!NP)[^ ]*) *<([^ ]*)([^>]*)> *<, ,> *<([^- ]*)([^ ]*-h[^>]*)>(.*)}/\1\(\2 \{\5-h <\3-mc-lc [\3-h\4] [, ,]> <\5\6>\}\7\)/;
    s/{((?!NP).*) *<, ,> *<([^ ]*)([^>]*)> *<, ,> *<([^- ]*)([^ ]*-h[^>]*)>(.*)}/\(\1 <, ,> \{\4-h <\2-mc-lc [\2-h\3] [, ,]> <\4\5>\}\6\)/;
    s/{((?!NP).*)<([^ ]*)([^>]*)> *<([^- ]*)([^ ]*-h[^>]*)>(.*)}/\(\1\{\4-h <\2\3> <\4\5>\}\6\)/;


#     # pass up to|vbg|vbn from VB to VP
#     s/{(VP)(?=[- ])(.*)<(VP|VB[A-Z]*|TO)([a-z]*)([^ ]*-h.*)>(.*)}/{\1\4\2<\3\4\5>\6}/;
#     # pass up to|vbg|vbn from IN to PP
#     s/{(PP|SBAR)(?=[- ])(.*)<(IN|TO)([a-z]*)([^ ]*-h.*)>(.*)}/{\1\4\2<\3\4\5>\6}/;
#      # pass up to from VP to S
#     s/{(S)(?=[- ])(.*)<(VP)(to)([^ ]*-h.*)>(.*)}/{\1\4\2<\3\4\5>\6}/;
# #     s/{(S)(-.*)? +(<NP.*) +(<VPto.*)}/{\1to\2 \3 \4}/;


    # undo self-projection (but keep mode tag)
    s/{([^- a-z]*)[a-z]*([^ ]*) *<[^- a-z]*([a-z]*)[^ ]*-h[^ ]*([^>]*)> *}/{\1\3\2\4}/;

    # trace: state of the line after the head-projection rules
    debug($step, " \\ $_");

    # S variants can only be to|pro|imp|that
    s/{(S[A-Z]*)((?!to)(?!pro)(?!imp)(?!that)[a-z]*)(.*)}/{\1\3}/;
    # PP can only be of
    s/{(PP)((?!of)[a-z]*)(.*)}/{\1\3}/;

    # undo any repetition of head tags
    while ( s/{([^ ]*)-h([^ ]*)-h([^ ]*) (.*)}/{\1-h\2\3 \4}/ ) {}


    #### TERMINAL SYMBOLS
    # propagate unary head pos at terminal
    s/{(NP)[a-z]*([^ ]*) +<(NN[A-Z]*)([a-z]*)([^ ]*) +([^<>]*)> *}/{\1\2 <\3\5 \6>}/;
    # propagate unary head pos at terminal
    s/{(VP)[a-z]*([^ ]*) +<(VB[A-Z]*)([a-z]*)([^ ]*) +([^<>]*)> *}/{\1\4\2 <\3\4\5 \6>}/;
    # propagate unary head pos at terminal
    s/{(ADJP)[a-z]*([^ ]*) +<(JJ[A-Z]*)([a-z]*)([^ ]*) +([^<>]*)> *}/{\1\2 <\3\5 \6>}/;
    # propagate unary head pos at terminal
    s/{(ADVP)[a-z]*([^ ]*) +<(RB[A-Z]*)([a-z]*)([^ ]*) +([^<>]*)> *}/{\1\2 <\3\5 \6>}/;
    # undo unary identity projection
    s/{([^ ]*) +<\1([^ ]*) ([^>]*)> *}/{\1\2 \3}/;

#    # remove unary children
#    s/{([^ ]*) +<[^ ]* +([^<>]*)> *}/{\1 \2}/;

#    # grab punct at end of any constituent
#    s/{([^ ]*) +(.*<.*<.*) <(\,|\.|\!|!question!)([^>]*)> *}/\(\1 \{\1 \2\} <\3\4>\)/;

    # group two adjacent CD children under a new CD, and an RB child followed
    # by a QP child under a new QP
    s/{(.*)<(CD [^>]*)> *<(CD [^>]*)>(.*)}/\(\1\{CD <\2> <\3>\4\}\)/;
    s/{(.*)<(RB [^>]*)> *<(QP [^>]*)>(.*)}/\(\1\{QP <\2> <\3>\4\}\)/;



#     #### VERB PHRASES (EXPERIMENTAL FLAT VPS)
#     # right-binarize VPs after right context reduced to nil
#     s/{(VP)[a-z]*([^ ]*) +(.*)<(ADVP|RB|PP)([^ ]* [^>]*)> *<(VB[A-Z]*|VP)([a-z]*)([^ ]* [^>]*)>( *)}/\(\1\7\2 \3\{\6\7 <\4\5> <\6\7\8>\}\9\)/; ##(<.*)<
#     # right-binarize VPs after right context does not contain normal constituents (i.e. contains punct like quote)
#     s/{(VP)[a-z]*([^ ]*) +(.*)<(ADVP|RB|PP)([^ ]* [^>]*)> *<(VB[A-Z]*|VP)([a-z]*)([^ ]* [^>]*)>( *<[^A-Z].*)}/\(\1\7\2 \3\{\6\7 <\4\5> <\6\7\8>\}\9\)/; ##(<.*)<
#     # NEW!
#     s/{(VP) +<((?=VB|BES|TO|VP)[A-Z]*)([a-z]*)([^>]*)>(.*)}/{\1\3 <\2\3\4>\5}/;
# #print "A: $_";
#     s/{(VP[^ ]*) +<((?=VB|BES)[^- ]*)([^>]*)> +<((?=S(?!BAR)|SBARthat|NP|ADJP|VP|PRT)[^- ]*)([^>]*)> *}/{\1 <\2-b\4\3> <\4\5>}/;
#     s/{(VP[^ ]*) +<((?=VB|BES)[^- ]*)([^>]*)> +(.*)<((?=S(?!BAR)|SBARthat|NP|ADJP|VP|PRT)[^- ]*)([^ ]*)([^>]*)> +<((?=S(?!BAR)|SBARthat|NP|ADJP|VP|PRT)[^- ]*)([^ ]*)([^>]*)>}/\($1 <$2-b$5-b$8$3> $4\{AdvP\*_$5$6_AdvP\*_$8$9_AdvP\* <$5$6$7> <$8$9${10}>\}\)/;
#     s/{(VP[^ ]*) +<((?=VB|BES)[^- ]*)([^>]*)> +(.*)<((?=S(?!BAR)|SBARthat|NP|ADJP|VP|PRT)[^- ]*)([^ ]*)([^>]*)> +<(AdvP\*[^ ]*)([^>]*)> *}/\(\1 <\2-b\5\3> \4\{AdvP\*_\5\6_\8 <\5\6\7> <\8\9>\}\)/;
#     s/{(VP[^ ]*) +<((?=VB|BES)[^- ]*)([^>]*)> +(.*)<((?=S(?!BAR)|SBARthat|NP|ADJP|VP|PRT)[^- ]*)([^ ]*)([^>]*)> +<([^ ]*)([^>]*)> *}/\(\1 <\2-b\5\3> \4\{AdvP\*_\5\6_AdvP\* <\5\6\7> <\8\9>\}\)/;
#     s/{(VP[^ ]*) +<((?=VB|BES)[^- ]*)([^>]*)> +(.*)<([^ ]*)([^>]*)> +<((?=S(?!BAR)|SBARthat|NP|ADJP|VP|PRT)[^- ]*)([^ ]*)([^>]*)>}/\(\1 <\2-b\7\3> \4\{AdvP\*_\7\8_AdvP\* <\5\6> <\7\8\9>\}\)/;
#     s/{(VP[^ ]*) +<((?=VB|BES)[^- ]*)([^>]*)> +(.*)<([^ ]*)([^>]*)> +<(AdvP\*[^ ]*)([^>]*)> *}/\(\1 <\2\3> \4\{\7 <\5\6> <\7\8>\}\)/;
#     s/{(VP[^ ]*) +<((?=VB|BES)[^- ]*)([^>]*)> +(.*)<([^ ]*)([^>]*)> +<([^ ]*)([^>]*)> *}/\(\1 <\2\3> \4\{AdvP\* <\5\6> <\7\8>\}\)/;
# #print "B: $_\n";

    ####################

    ## convert inner angles (if any) to bracks...
    while ( s/{(.*)<([^>]*)>(.*)}/{\1\[\2\]\3}/ ){}
    ## convert outer braces to angles...
    $_ =~ s/{(.*)}/<\1>/;
  }
  ## finish up...
  ## (only the first '<' and the first '>' on the line are converted here)
  $_ =~ s/</[/;
  $_ =~ s/>/]/;
  ## translate to parens again...
  $_ =~ s/\[/\(/g;
  $_ =~ s/\]/\)/g;


  ## undo the temporary INTJ shift made before the constituent loop
  $_ =~ s/____INTJ/INTJ/g;


#  ## remove `punctuation-delimited constituent' tags (??)
#  s/-.dlt//g;

# WS: COMMENTED OUT B/C WRECKING COLON PRE_PROC (COMMENT BACK IN AND DO REGRESSION TEST)
#  ## unshift colons (for SRL) and dashes
#  $_ =~ s/!colon!!colon!/:!colon!/g;
#  $_ =~ s/!colon!([^ !]+)/:\U\1/g;
#  #$_ =~ s/:([^ ]*)\!dash!(.*)/:\1--\2/g;
#  #$_ =~ s/:([^ ]*)\!dash!(.*)/:\1--\2/g;
#  $_ =~ s/RELDASH/-/g;
#  $_ =~ s/:REL-([^\.]+)\./:REL-\L\1./g;

#  s/\!semi\! \!semi\!/, \!semi\!/g;

  print $_;
}
