###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################


# Usage: cat wsj_0001.trees | perl scripts/treesed.pl [-p]
#   -p   delete punctuation leaves instead of relabelling them
use Getopt::Std;

use strict;
use warnings;

# Line-oriented filter for bracketed tree text: reads trees from the files
# named on the command line (or STDIN), rewrites punctuation leaves and
# shifts a fixed set of symbols to "!name!" spellings, and prints each
# rewritten line to STDOUT.
#
# Options:
#   -p   delete punctuation leaves outright before any other rewriting

# Leaves of the form "(LABEL TOKEN)" that are deleted when -p is given.
# Rules are applied one after another in exactly this order.  The nested-dash
# rule must run before the plain "--" rule, otherwise the inner leaf would be
# removed first and its wrapper node left behind.
my @REMOVABLE_LEAVES = (
    qr/\([^ ]+ \.\.\.\)/,
    qr/\([^ ]+ \.\)/,
    qr/\([^ ]+ !\)/,
    qr/\([^ ]+ \?\)/,
    qr/\([^ ]+ ,\)/,
    qr/\([^ ]+ *\([^ ]+ *--\) *\)/,    # dash leaf wrapped in an extra node
    qr/\([^ ]+ --\)/,
    qr/\([^ ]+ -\)/,
    qr/\([^ ]+ ;\)/,
    qr/\([^ ]+ :\)/,
    qr/\([^ ]+ `\)/,
    qr/\([^ ]+ '\)/,
    qr/\([^ ]+ ``\)/,
    qr/\([^ ]+ ''\)/,
    qr/\([^ ]+ -L.B-\)/,               # "." is a wildcard, e.g. -LRB-, -LCB-
    qr/\([^ ]+ -R.B-\)/,
);

# Punctuation leaves that are kept but rewritten to a fixed "(LABEL TOKEN)"
# form, as [ pattern, replacement ] pairs applied in order.  Note that a
# single "-" leaf is rewritten to the same "(, --)" form as a double dash.
my @RELABEL_RULES = (
    [ qr/\([^ ]+ \.\.\.\)/,           '(, ...)' ],
    [ qr/\([^ ]+ \?\)/,               '(? ?)'   ],
    [ qr/\([^ ]+ *\([^ ]+ *--\) *\)/, '(, --)'  ],    # nested dash node
    [ qr/\([^ ]+ --\)/,               '(, --)'  ],
    [ qr/\([^ ]+ -\)/,                '(, --)'  ],
    [ qr/\([^ ]+ :\)/,                '(, :)'   ],
    [ qr/\([^ ]+ ;\)/,                '(, ;)'   ],
    [ qr/\([^ ]+ `\)/,                '(` `)'   ],
    [ qr/\([^ ]+ '\)/,                q{(' ')}  ],
);

# Global symbol shift: every occurrence anywhere on the line (labels and
# tokens alike) is replaced by a "!name!" spelling.
# Deliberately NOT shifted: ! ` `` $ % ^ * = " ' '' . ,
my @SYMBOL_SHIFTS = (
    [ qr/~/,   '!tilde!'     ],
    [ qr/\@/,  '!at!'        ],
    [ qr/\#/,  '!pound!'     ],
    [ qr/&/,   '!ampersand!' ],
    [ qr/--/,  '!dash!'      ],
    [ qr/\+/,  '!plus!'      ],
    [ qr/:/,   '!colon!'     ],
    [ qr/;/,   '!semi!'      ],
    # A "?" directly after "*" (as in "*?*") is left alone.  The lookbehind
    # is zero-width, so a "?" at the start of the line and every "?" in a
    # run such as "??" are shifted too; a consuming "[^*]" prefix misses
    # both of those cases.
    [ qr/(?<!\*)\?/, '!question!' ],
    [ qr{/},   '!slash!'     ],
);

# transform_line($line, $remove_punct)
#   $line         - one line of tree text (trailing newline is preserved)
#   $remove_punct - true to delete punctuation leaves first (the -p switch)
# Returns the rewritten line; the argument itself is not modified.
sub transform_line {
    my ($line, $remove_punct) = @_;

    if ($remove_punct) {
        for my $leaf (@REMOVABLE_LEAVES) {
            $line =~ s/$leaf//g;
        }
    }

    # Get rid of typos (categories starting with ^).
    $line =~ s/\(\^/(/g;

    # Distinguish the punctuation leaves that are still present.
    for my $rule (@RELABEL_RULES) {
        my ($pattern, $replacement) = @{$rule};
        $line =~ s/$pattern/$replacement/g;
    }

    for my $shift (@SYMBOL_SHIFTS) {
        my ($pattern, $replacement) = @{$shift};
        $line =~ s/$pattern/$replacement/g;
    }

    return $line;
}

# Parse the command line, then filter every input line to STDOUT.
sub main {
    my %opts;

    # Getopt::Std itself reports unknown switches on STDERR; processing
    # continues regardless, as it always has for this script.
    getopts('p', \%opts);
    my $remove_punct = $opts{p} ? 1 : 0;

    ## for each tree...
    while (my $line = <>) {
        print transform_line($line, $remove_punct);
    }

    return;
}

# Run as a filter when executed directly; stay quiet when loaded via require.
main() unless caller;

1;
