###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################

################################################################################
#
#  Macros & variables
#
################################################################################

# Recipes below use bash-specific syntax, so pin the shell explicitly.
SHELL = /bin/bash
# Header search paths; ../rvtl and ../slush are sibling checkouts.
INCLUDES = -Iinclude -I../rvtl/include -I../slush/include #-I/sw/include #-I/Users/dingcheng/Documents/boost/boost_1_44_0
# The backquoted `cat` is run by the shell at recipe time, so user-cflags.txt
# is consulted on every compile (it is auto-created by a rule below).
CFLAGS = $(INCLUDES) -Wall `cat user-cflags.txt` -g #-DNDEBUG -O3 #-DNOWARNINGS #-g #
CC = g++
LD = g++

# A literal comma, for use inside $(subst ...) calls where a bare ","
# would be parsed as an argument separator.
comma = ,

PROPTXT       = /project/nlp/data/propbank/propbank-1.0/prop.txt
READINGDATA   = /project/nlp/data/readingtimes

# Treebank sections used for training / heldout evaluation.
SWBDTRAINSECTS = 2 3 EOS
WSJTRAINSECTS  = 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 EOS
#WSJTRAINSECTS  = 02
WSJHELDOUTSECTS  = 00
BROWNTRAINSECTS = cf cg ck cl cm cn cp cr

include $(wildcard */*.d)       ## don't comment out, it breaks make!

.SECONDEXPANSION:


################################################################################
#
#  User-specific parameter files (not shared; created by default with default values)
#
#  These parameter files differ from user to user, and should not be checked in.
#  This script just establishes 'official' default values for these parameters.
#
################################################################################

#### c++ compile flags (auto-created on first run with optimized defaults; edit by hand to add -g)
user-cflags.txt:   ## -g
	printf '%s\n' '-DNDEBUG -O3' > $@

#### location of treebank
# Pulls in USER_TREEBANK_LOCATION.  If the file is missing, make runs the rule
# below to create it with a default path, then automatically re-reads the
# makefile (standard remade-include behavior) so the variable takes effect.
include user-treebank-location.txt
user-treebank-location.txt:
	echo 'USER_TREEBANK_LOCATION = /project/nlp/data/treebank' > $@
	@echo ''
	@echo 'ATTENTION: I had to create "user-treebank-location.txt" for you, which may be wrong'
	@echo 'edit it to point at your treebank repository and re-run make to continue!'
	@echo ''

#### includes for user sub-projects (default includes are all commented out)
# Auto-created list of optional sub-project makefiles; users uncomment lines
# in the generated file to enable them.  Every generated line starts with '#'
# so a fresh checkout enables nothing.  (The srl line is itself commented out
# here, so it is never even offered in the generated file.)
include user-subproject-includes.txt
user-subproject-includes.txt:
	echo '#include spliteval.mk' >  $@
	echo '#include swbd.mk'      >> $@
	echo '#include hdwd.mk'      >> $@
	#echo '#include srl.mk'       >> $@
	echo '#include coref.mk'     >> $@


################################################################################
#
#  Code compilation
#
################################################################################

#### by default build only the semantic role labeling parser, for some reason
# Declared phony so a stray file named "all" can never mask this target.
.PHONY: all
all: bin/hhmmparser-srl


#### bin directory (ignored by git b/c empty)
# -p makes the rule idempotent and safe under parallel builds.
bin:
	mkdir -p $@


#### c++ dependencies
# Generate a %.d fragment listing every header %.cpp includes (via g++ -MM),
# wrapped in $(wildcard ...) so a deleted header does not abort the build.
# The second copy of the list (with .d rewritten to .cpp) makes the source
# file itself depend on the same headers.  The trailing "#' ##" on the echo
# lines is a shell comment, apparently there to re-balance editor syntax
# highlighting of the unmatched quote.
.PRECIOUS: %.d
%.d: %.cpp
	echo '$*.d: \' > $@   #' ##
	echo `$(CC) $(INCLUDES) -MM $<` | sed 's/^.*:.*\.cpp */ $$(wildcard /;s/\\ *//g;s/$$/)/' >> $@
	cat $@ | sed 's/\.d:/\.cpp:/' >> $@
#	$(CC) -MM $< | sed 's/^.*://' >> $@
#### ocaml dependencies
# Derive .cmx prerequisites from '#load "..."' directives in the .ml source
# (library .cmxa loads are filtered out).
%.d: %.ml
	echo '$@ : \' > $@   #' ##
	echo `grep '#load' $< | sed 's/.*\"\(.*\)\".*/\1/' | grep -v 'cmxa'` | sed 's/\.ml/\.cmx/g' >> $@
	echo '$< : \' >> $@   #' ##
	echo `grep '#load' $< | sed 's/.*\"\(.*\)\".*/\1/' | grep -v 'cmxa'` | sed 's/\.ml/\.cmx/g' >> $@


#### c++ executables
.PRECIOUS: bin/%
# Build each parser binary from a single translation unit.  Libraries are
# placed AFTER the source file: linkers that resolve symbols left-to-right
# (or link with --as-needed) discard -lm when it precedes the objects that
# need it, causing undefined-reference errors.
bin/%: src/%.cpp src/%.d user-cflags.txt | bin
	$(CC) $(CFLAGS) $< -lm -o $@
#	$(CC) $(CFLAGS) -L/sw/lib/ -lboost_thread -lm $< -o $@
#	$(CC) $(CFLAGS) -L/Users/dingcheng/Documents/boost/boost_1_44_0/stage/lib -lboost_thread -lm $< -o $@
#### ocaml executables
# Compile a .ml to .cmx / link to a binary; the backquoted grep rebuilds the
# same '#load' list used for the .d dependency fragments above.
%.cmx: %.ml %.d
	ocamlopt -I scripts `grep '#load' $< | sed 's/.*\"\(.*\)\".*/\1/' | sed 's/\.ml/\.cmx/g'` $< -c $@
bin/%: scripts/%.ml scripts/%.d | bin
	ocamlopt -I scripts `grep '#load' $< | sed 's/.*\"\(.*\)\".*/\1/' | sed 's/\.ml/\.cmx/g'` $< -o $@
#### cython executables
# NOTE(review): links -lpython2.5 but includes Python 2.6 headers (and the
# include path is Mac-specific) -- looks stale; confirm before relying on it.
%.c: %.py
	cython --embed $< -o $@
bin/%: scripts/%.c
	gcc  -lpython2.5 -I /Library/Frameworks/Python.framework/Versions/2.6/include/python2.6/ $< -o $@
#### java executable objects
%.class: %.java 
	javac $<


#### GPL packages from other authors included in modelblocks
bin/evalb: src/evalb.c | bin
	gcc -Wall -g -o $@ $<
# Compile the bundled Stanford NLP sources (sibling checkout under ../edu)
# and jar them up into this directory.  "&&" so a failed cd or javac aborts
# the recipe; the old trailing "cd wsjparse" was a no-op since every recipe
# line runs in its own shell.
stanford-tools.jar: # for the parser and for tree surgeon
	javac ../edu/stanford/nlp/*/*.java ../edu/stanford/nlp/*/*/*.java ../edu/stanford/nlp/*/*/*/*.java
	cd .. && jar -cf wsjparse/stanford-tools.jar edu/


################################################################################
#
#  HTML pre-processing
#
################################################################################

#### obtain html from wikipedia
srcmodel/wikipedia_%.html:
	wget  en.wikipedia.org/wiki/$*  -O $@
#	wget  en.wikipedia.org/wiki/$(notdir $*)  -O $@

#### obtain txt from html by recursively removing matched pairs of markup (except for those containing the article), then unpaired markup
# Flattens the page to one line, keeps only the <body>, strips anchors/bold/
# italic while keeping their text, then loops removing innermost matched tag
# pairs (sparing the bodyContent div) before deleting any leftover tags.
%.txt: %.html
	cat $< | tr -d '\n' | perl -pe 's/^.*<body[^>]*>(.*)<.body>.*$$/\1/g;  s/<p>/ /g;  s/<(a) ?.*?>(.*?)<\/\1>/\2/g;  s/<(b|i) ?.*?>(.*?)<\/\1>/\2/g;  while ( s/<(?!div id="bodyContent")([a-z0-9]*)[ >](.(?!<\1))*?<\/\1>//g ){};  s/<[^>]*>//g;  s/^[ \t]*(.*?)[ \t\n]*$$/\1/g;' | perl -pe "s/&#160;/ /g;  s/—/--/g" > $@

# Sentence-split and tokenize: convert quotes to PTB ``/'' form, break on
# sentence-final punctuation, lowercase, and escape ( ) ; : as !lrb! etc.
%.sents: %.txt
	cat $< | perl -pe "s/\"(?! )(?!\))/\`\`/g;  s/(?<! )\"/''/g" | perl -pe "s/(?<! c)(?<! ca)([\.!?])(\)|'')? */ \1 \2\n/g;  s/(\(|\`\`)(?! )/\1 /g;  s/(?<! )(\)|,|;|:|'')(?![0-9])/ \1/g;  y/[A-Z]/[a-z]/"  |  perl -pe "s/\(/\!lrb\!/g;  s/\)/\!rrb\!/g;  s/;/\!semi\!/g;  s/:/\!colon\!/g"  >  $@

################################################################################
#
#  Corpus pre-processing
#
################################################################################

#### parameters fixed in make   NOTE: THIS SHOULD BE A FILENAME PARAMETER!!!
# min number of constituent category tokens in training set to establish a type -- usually 10
# (comment moved to its own line: an inline trailing comment left trailing
# spaces inside the variable's value, i.e. "10    ")
SRL_CCOUNTCUTOFF = 10


#### genmodel directory (ignored by git b/c empty)
# -p makes the rule idempotent and safe under parallel builds.
genmodel:
	mkdir -p $@

#### trees files, with one tree per line, extracted from various treebanks
genmodel/wsj%.trees: user-treebank-location.txt $(USER_TREEBANK_LOCATION)/parsed/mrg/wsj/% scripts/tbtrees2trees.pl | genmodel  ##scripts/propParent.rb
#	make genmodel
	cat $(USER_TREEBANK_LOCATION)/parsed/mrg/wsj/$*/*.mrg | perl scripts/tbtrees2trees.pl > $@
# Switchboard: strip punctuation, then clean-tool rewrites $@ in place.
# NOTE(review): unlike the wsj rule these lack the "| genmodel" order-only
# prerequisite, so they fail on a fresh checkout until genmodel exists.
.PRECIOUS: genmodel/swbd%.trees
genmodel/swbd%.trees: $(USER_TREEBANK_LOCATION)/parsed/mrg/swbd/% scripts/removePunct.sed stanford-tools.jar ##scripts/propP
	cat $</*.mrg | sed -f scripts/removePunct.sed > $@
	clean-tool/new-clean $@
# fswbd variant differs only in using super-clean instead of new-clean.
genmodel/fswbd%.trees: $(USER_TREEBANK_LOCATION)/parsed/mrg/swbd/% scripts/removePunct.sed stanford-tools.jar ##scripts/propP
	cat $</*.mrg | sed -f scripts/removePunct.sed > $@
	clean-tool/super-clean $@
# Disfluency (.dps) annotation for a swbd section.
genmodel/swbd%.dps: $(USER_TREEBANK_LOCATION)/dysfl/dps/swbd/%
	cat $</*.dps > $@

genmodel/swbd-lex%.lextrees: genmodel/swbd.trees
	cat $< | ./scripts/lexicalizeVerbs.sh $* > $@

# Stem words here are dash-separated: word 1 selects the .lextrees input,
# words 2 and 3 are flags passed to buildModel.rb before/after binarization.
genmodel/swbd-lex%.bintrees: $$(word 1,$$(subst -, ,$$@)).lextrees
#	make genmodel/swbd-lex$(word 1,$(subst -, ,$*)).lextrees
	cat genmodel/swbd-lex$(word 1,$(subst -, ,$*)).lextrees | perl -p -e 's/\(S1 (.*)\)/\1/' | ruby scripts/buildModel.rb -$(word 2,$(subst -, ,$*)) | perl scripts/treebinarize.pl | ruby scripts/buildModel.rb -$(word 3,$(subst -, ,$*)) > $@

# NOTE(review): the swbd rule above calls scripts/lexicalizeVerbs.sh but this
# one calls a .rb script -- confirm the divergence is intentional.
genmodel/fswbd-lex%.lextrees: genmodel/fswbd.trees
	cat $< | ruby scripts/lexicalizeVerbs.rb $* > $@

genmodel/fswbd-lex%.bintrees: $$(word 1,$$(subst -, ,$$@)).lextrees
#	make genmodel/fswbd-lex$(word 1,$(subst -, ,$*)).lextrees
	cat genmodel/fswbd-lex$(word 1,$(subst -, ,$*)).lextrees | perl -p -e 's/\(S1 (.*)\)/\1/' | ruby scripts/buildModel.rb -$(word 2,$(subst -, ,$*)) | perl scripts/treebinarize.pl | ruby scripts/buildModel.rb -$(word 3,$(subst -, ,$*)) > $@

#### brown corpus sections -> one-tree-per-line format
genmodel/brown%.trees: user-treebank-location.txt $(USER_TREEBANK_LOCATION)/parsed/mrg/brown/% scripts/tbtrees2trees.pl
	cat $(word 2,$^)/*.mrg | perl $(word 3,$^) > $@


#### obtain EOS trees file from srcmodel (if exists)
# Any corpus-specific *EOS.trees target (e.g. wsjEOS, swbdEOS) is generated
# from the single shared srcmodel/EOS.trees file.
genmodel/%EOS.trees: srcmodel/EOS.trees scripts/tbtrees2trees.pl
	cat $< | perl scripts/tbtrees2trees.pl > $@
#### obtain other trees file from srcmodel (if exists)
genmodel/%.trees: srcmodel/%.trees scripts/tbtrees2trees.pl
	cat $< | perl scripts/tbtrees2trees.pl > $@


#### obtain head-annotated trees (via magerman-black rules)
.PRECIOUS: %.headtrees
%.headtrees: %.trees scripts/trees2headtrees.pl
	perl $(word 2,$^) < $< > $@


#### obtain binarized trees: punctuation removed
%-np.bintrees: %.trees scripts/treebinarize.pl
	perl $(word 2,$^) -p < $< > $@
#### obtain binarized trees: punctuation retained
.PRECIOUS: %-pu.bintrees
%-pu.bintrees: %.trees scripts/treebinarize.pl
	perl $(word 2,$^) < $< > $@



#### obtain argument-annotated trees (annotated with argument subcategorization)
.PRECIOUS: %.argtrees
%.argtrees: %.bintrees scripts/propArgs.rb
	ruby $(word 2,$^) < $< > $@


#### obtain gap-annotated trees (annotated with HPSG-like extractions instead of GB-like traces)
.PRECIOUS: %.gaptrees
%.gaptrees: %.argtrees scripts/propGaps.pl scripts/propPunct.pl
	perl $(word 2,$^) < $< | perl $(word 3,$^) > $@


#### obtain chomsky normal form trees without lowercase-preterms by adding underscore nonterminals
# The inline perl rewrites each preterminal (cat word) into (cat cat#word)
# before the CNF conversion; lines where the substitution fails are dropped.
.PRECIOUS: %.ucnftrees
%.ucnftrees: %.gaptrees scripts/trees2cnftrees.pl
	cat $< | perl -ne 's/\((\w+\!semdelim\!)?([\w\.\-]+\!pbrdelim\!)?([^ ]*) *([^()#]*) *\)/(\1\2\3 \3\#\4)/g && print' | perl scripts/trees2cnftrees.pl > $@


#### obtain corpus with no rare (singleton) words; substitute literal "unk" for singleton words
%-unk.ucnftrees:  %.ucnftrees  scripts/elim-rare-words.py
	python $(word 2,$^) < $< > $@


#### obtain corpus with no rare (singleton) categories (mostly un-projected underscore cats)
%-nr.ucnftrees:  %.ucnftrees  scripts/elim-rare-cats.py
	python $(word 2,$^) < $< > $@


#### obtain variable-annotated trees by replacing right-propagated tags with %'s (to avoid uninformed speculation during incremental processing)
.PRECIOUS: %-v.ucnftrees
%-v.ucnftrees: %.ucnftrees scripts/propVars.pl
	cat $< | perl $(word 2,$^) > $@


#### obtain headword-limited cnf trees from unconverted cnf trees, using count cutoff parameter following '-hw' delimiter (e.g. wsjTRAIN-pu-hw1000)
# Secondexpansion: the prerequisite is the target name with its final "-hwN"
# suffix stripped ($(wordlist 2,...) with a leading "-" pads word indices so
# the split-off pieces line up); N is passed to calcHdwdTree.rb as -b N.
%.ucnftrees: $$(wordlist 2,$$(words $$(subst -hw, ,$$@)),- $$(subst -hw, ,$$@)).ucnftrees  scripts/calcHdwdTree.rb
	cat  $<  |  ruby $(word 2,$^)  -b $(word 2,$(subst -hw, ,$*))  |  sed 's/( /(/g;s/ )/)/g;s/ )/)/g'  |  grep -v '^$$'  >  $@


#### obtain cnf trees with N rarest cats removed (given by -ccN)
# Same stem-splitting trick as above, keyed on the "-cc" delimiter.
%.ucnftrees: $$(wordlist 2,$$(words $$(subst -cc, ,$$@)),- $$(subst -cc, ,$$@)).ucnftrees  bin/expand-mod-relns  bin/remove-rare-cats
	cat  $<  |  sed 's/(\([^hm][^ )(]*\)/(h:\1/g'  |  $(word 2,$^)  |  $(word 3,$^)  $(word 2,$(subst -cc, ,$*))  >  $@


#### chomsky normal form trees annotated with head/modifier relation labeles
# %hm.ucnftrees: %.ucnftrees scripts/calcHdwdTree.rb
# 	cat $< | ruby scripts/calcHdwdTree.rb -o -n | sed 's/(\([^ \(\)][^ \(\)]*\)\* \([^ \(\)][^ \(\)]*\)#/(\1* \2*#/g' | sed 's/( /[/g;s/ )/]/g' | sed 's/\([^\*\(\)]\) /\1\*m /g' | sed 's/\* /\*i /g;s/\*\#/\#/g' | grep -v '^$$' > $@


#### obtain stupid lowercase-preterm form that hhmm needs by lowercasing preterms
.PRECIOUS: %.cnftrees
%.cnftrees:  %.ucnftrees  scripts/lowercasepreterm.rb
	ruby $(word 2,$^) < $< > $@


#### obtain depth-sensitive cnf trees with terminals converted to lowercase, using word count cutoff parameter following '-cc' delimiter (e.g. wsjTRAIN-pu-hw1000-cc10)
# grep drops any tree that reaches right-corner depth 5 (beyond model bound).
%.dcnftrees:  %.cnftrees  scripts/cnftrees2cedepths.rb
	cat $<  |  ruby $(word 2,$^)  |  grep -v '\^R,5'  >  $@


#%.dcnftrees: $$(word 1,$$(subst -cc, ,%)).hcnftrees  bin/expand-mod-relns   bin/remove-rare-cats                                    scripts/lowercasepreterm.rb        scripts/cnftrees2cedepths.rb
#	cat $< | sed 's/(\([^hm][^ )(]*\)/(h:\1/g' | bin/expand-mod-relns | bin/remove-rare-cats $(word 2,$(subst -cc, ,$*)) | ruby scripts/lowercasepreterm.rb | ruby scripts/cnftrees2cedepths.rb | grep -v '\^R,5' > $@

#%.dcnftrees: $$(word 1,$$(subst -cc, ,%)).cnftrees scripts/cnftrees2cedepths.rb                                       bin/expand-mod-relns   bin/remove-rare-cats
#	cat $< |   ruby                            scripts/cnftrees2cedepths.rb | sed 's/(\([^hm][^ )(]*\)/(h:\1/g' | bin/expand-mod-relns | bin/remove-rare-cats $(word 2,$(subst -cc, ,$*)) > $@

#%.dcnftrees: %.cnftrees scripts/cnftrees2cedepths.rb                                     ###  bin/remove-rare-cats
#	cat $< |   ruby scripts/cnftrees2cedepths.rb > $@
##	cat $< |   ruby scripts/cnftrees2cedepths.rb | sed 's/(\([^hm][^ )(]*\)/(h:\1/g' | bin/remove-rare-cats $(COUNTCUTOFF) > $@
##	cat $< |   ruby scripts/cnftrees2cedepths.rb | sed 's/\^[LR],[0-9]/\^L,1/g' > $@


#### symbol counts
# Frequency table of top-level category symbols in any trees-like file.
%.symbolcounts: %
	cat $< | bin/indent2 | sed 's/^ *(\([^ ]*\).*$$/\1/' | sort | uniq -c | sort > $@


#### right-corner transformed trees
%.rctrees: %.cnftrees scripts/cnftrees2flattrees.pl scripts/flattrees2rctrees.pl
	cat $< | perl scripts/cnftrees2flattrees.pl | perl scripts/flattrees2rctrees.pl > $@
#### right-corner transformed trees with depth annotated
%.crctrees: %.rctrees scripts/countRights.rb
	cat $< | grep -v '^$$' | ruby scripts/countRights.rb | sed 's/( *\([^ ]*\)/(\1/g' | sed 's/(\([^ ]*\)\^[0-9] *\([^ ]*\)\^[0-9] *)/(\1 \2)/g' > $@
#### statistics for maximum depth
# Emits, per tree, the largest ^N depth annotation found on that line.
%.maxcrc: %.crctrees
	cat $< | perl -n -e '$$_ =~ s/^[^\^]*\^/ \^/; @S=split(/ [^\^]*\^/); $$r=0; foreach $$s (@S) { $$r=($$s>$$r)?$$s:$$r; } print "$$r\n";' > $@

#### left-corner transformed trees
%.lctrees: %.cnftrees scripts/cnftrees2lcflattrees.pl scripts/lcflattrees2lctrees.pl
	cat $< | perl scripts/cnftrees2lcflattrees.pl | perl scripts/lcflattrees2lctrees.pl > $@
#### left-corner transformed trees with depth annotated
%.clctrees: %.lctrees scripts/countLefts.rb
	cat $< | grep -v '^$$' | ruby scripts/countLefts.rb | sed 's/( *\([^ ]*\)/(\1/g' | sed 's/(\([^ ]*\)\^[0-9] *\([^ ]*\)\^[0-9] *)/(\1 \2)/g' > $@
#### statistics for maximum depth
%.maxclc: %.clctrees
	cat $< | perl -n -e '$$_ =~ s/^[^\^]*\^/ \^/; @S=split(/ [^\^]*\^/); $$r=0; foreach $$s (@S) { $$r=($$s>$$r)?$$s:$$r; } print "$$r\n";' > $@


#### speech repair trees (???)
# Builds $@ via the right-corner pipeline, then post-edits it in place with a
# series of perl -pi passes.  The tab-indented "#perl ..." lines are shell
# comments inside the recipe (kept as documentation of abandoned attempts).
%.sptrees: %.gaptrees
	cat $^ | sed 's/(\([^ ]*\) *\([^()#]*\) *)/(\1 \1\#\2)/g;s/ \([A-Z\$$]*\)[^ #]*#/ \1#/g' | perl scripts/trees2cnftrees.pl | perl scripts/cnftrees2flattrees.pl | perl scripts/flattrees2rctrees.pl > $@
	#perl -pi -e 's/\[/(/g;s/\]/)/g' $@
	perl -pi -e 's/-[^ ()\/]*\//\//g' $@     ## Removes dashes from slash categories?  why? stanford parser removes them anyways thats why
	#perl -pi -e 's/([A-Z\$$]+[^ ]*)\#([^ \)]+)/(\1 \2)/g' $@ ## FIXME!!
	#perl -pi -e 's/([A-Z\$$]+)[^ ]*\#([^ \)]+)/(\1 \2)/g' train.rctrees
	perl -pi -e 's/ ([A-Z\$$]+[^ #]*)\#([^ \)]+)/ (\1 \2)/g' $@ ## OK FIXED!!
	perl -pi -e 's/\(([^ ]+) \(\1 ([^ )(]+)\)\)/(\1 \2)/g' $@
	perl -pi -e 's/_/UNDERSCORE/g' $@
	perl -pi -e 's/(.*)/(S1 \1)/' $@

#### collections
# Concatenate per-section trees files into full training sets.
genmodel/wsjTRAIN.trees: $(foreach sect,$(WSJTRAINSECTS),genmodel/wsj$(sect).trees) ##genmodel/wsjEOS$*trees  ##genmodel/eos.cnftrees
	@echo "WARNING: undertaking major rebuild from '$@'!  Press CTRL-C to abort!"
	@sleep 5
	cat $^ > $@
genmodel/brownTRAIN.trees: $(foreach sect,$(BROWNTRAINSECTS),genmodel/brown$(sect).trees)
	cat $^ > $@
genmodel/swbdTRAIN.trees: $(foreach sect,$(SWBDTRAINSECTS),genmodel/swbd$(sect).trees) ##genmodel/swbdEOS$*trees  ##genmodel/eos.cnftrees
	cat $^ > $@
# NOTE(review): fswbdTRAIN is assembled from the plain swbd section files,
# not the fswbd ("super-clean") ones built above -- confirm this is intended.
genmodel/fswbdTRAIN.trees: $(foreach sect,$(SWBDTRAINSECTS),genmodel/swbd$(sect).trees) ##genmodel/swbdEOS$*trees  ##genmodel/eos.cnftrees
	cat $^ > $@
genmodel/swbdTRAIN.dps: genmodel/swbd2.dps genmodel/swbd3.dps
	cat $^ > $@
# part-of-speech stream for a wsj section
# FIX: the prerequisite was "genmodel/wsj$*.bintrees" -- "$*" is expanded when
# the makefile is read (this rule uses no $$-escaping, so .SECONDEXPANSION
# does not help), making it empty and the dependency always the nonexistent
# "genmodel/wsj.bintrees".  The pattern stem "%" is what was intended.
# FIX: redirect the pipeline into $@; the old recipe never wrote the target.
# (Assumes scripts/trees2pos.sh writes to stdout -- TODO confirm.)
genmodel/wsj%.pos: genmodel/wsj%.bintrees
	cat $^ | ./scripts/trees2pos.sh > $@


#### selective tree sets: look for selections in the order they occur below:
#### use only trees with N or greater words
# Secondexpansion stem-splitting on the "-minwds"/"-first" delimiters, same
# scheme as the -hw/-cc rules above.
%.trees: $$(wordlist 2,$$(words $$(subst -minwds, ,$$@)),- $$(subst -minwds, ,$$@)).trees
	cat $< | perl -na -e "if (split(/\#/)>$(word 2,$(subst -minwds, ,$*))) {print $$_;}" > $@      ## (note: split >=X would mean num words >=X-1)
#### use only first N trees
%.trees: $$(wordlist 2,$$(words $$(subst -first, ,$$@)),- $$(subst -first, ,$$@)).trees
	head -$(word 2,$(subst -first, ,$*)) $< > $@

#%-first393.trees: %.trees
#	head -393 $< > $@


#### unigram and bigram probabilities
genmodel/%.unigram: genmodel/%.cnftrees
	cat $^ | perl scripts/trees2words.pl -m | perl scripts/relfreq.pl -l | perl scripts/sortbyprob.pl > $@
genmodel/%.bigram: genmodel/%.cnftrees
	cat $^ | perl scripts/trees2words.pl | perl scripts/words2bigrams.pl -c 10 | perl scripts/relfreq.pl -l | perl scripts/sortbyprob.pl > $@
%.brownnp.ngrampred.csv: genmodel/brownnp.unigram genmodel/brownnp.bigram  # for 1 word per line
	cat $*.sent | perl scripts/sent2ngramprob.pl $^ > $@

#### reading times data
# These rules build under $(READINGDATA) and then symlink the result into the
# working directory.  "ln -sf" (not plain "ln -s") so a re-run does not fail
# when the link already exists.
# NOTE(review): the perl -pi below edits local "$@" before the symlink is
# created, so on a first-ever run it only warns about a missing file and the
# split is applied on the next run -- confirm whether that is intended.
readingdata.sent:
	cat  $(READINGDATA)/reading_time_data_bachrach_cardenas-unix.csv | perl scripts/readingtimes2words.pl > $(READINGDATA)/$@
	perl -pi -e 's/had his own responsibility it was/had his own responsibility\nit was/' $@ # a long compounds sentence, probably split by punctuation
	ln -sf $(READINGDATA)/$@
##readingdata.one.sent: $(READINGDATA)/reading_time_data_bachrach_cardenas-unix.csv
readingdata.words:
	cat  $(READINGDATA)/reading_time_data_bachrach_cardenas-unix.csv | perl scripts/readingtimes2words.pl -l > $(READINGDATA)/$@
	ln -sf $(READINGDATA)/$@
# Fill empty CSV fields with zeros.
readingdata.data.csv:
	cat $(READINGDATA)/reading_time_data_bachrach_cardenas-unix.csv | perl -pe 's/(,){23}/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0/g' | sed 's/,\s\s*,/,0,/g' > $(READINGDATA)/$@
	ln -sf $(READINGDATA)/$@
readingdata.closed: readingdata.words closed.class.words.txt
	perl scripts/closedclasswords.pl readingdata.words closed.class.words.txt > $@
closed.class.words.txt: 
	ln -sf $(READINGDATA)/closed.class.words.txt
#readingdata.times: $(READINGDATA)/reading_time_data_bachrach_cardenas-unix.csv
#	cat $^ | perl scripts/readingtimes2words.pl -t > $(READINGDATA)/$@
#	ln -s $(READINGDATA)/$@
#readingdata.other: $(READINGDATA)/reading_time_data_bachrach_cardenas-unix.csv
#	perl -n -e 'if (m/^([^,]*,[^,]*,)([^,]*,)([^,]*,[^,]*),(.*)/) {print $$1.length($$2).",".$$3."\n";}' data/readingtimes/reading_time_data_bachrach_cardenas-unix.csv | tail -n +2 > $(READINGDATA)/$@ 
#	ln -s $(READINGDATA)/$@
#### comma-separated -> tab-separated
%.tab: %.csv
	tr ',' '\t' < $< > $@

################################################################################
#
#  Model building
#
################################################################################


#### pcfg counts
# Strip -h head marks, extract rules, and compute relative frequencies.
.PRECIOUS: %.pcfg.counts
%.pcfg.counts: %.dcnftrees  scripts/trees2rules.pl  scripts/relfreq.pl
	cat $<  |  sed 's/-h\([- \^#]\)/\1/g'  |  perl $(word 2,$^)  |  perl $(word 3,$^) -f  >  $@
#### variable pcfg counts
.PRECIOUS: %.v-pcfg.counts
%.v-pcfg.counts: %-v.dcnftrees  scripts/trees2rules.pl  scripts/relfreq.pl
	cat $<  |  perl $(word 2,$^)  |  perl $(word 3,$^) -f  >  $@
#	cat $<  |  perl $(word 2,$^)  |  sed 's/GG \([^|]*\).*\(\^.*\) :/GG \1\2 :/'  |  sed 's/G : \(.*\)|.*/G : \1/'  |  sed 's/[^ ]*|//g'  |  perl $(word 3,$^)  >  $@

#### part-of-speech model: intermediate tally of POS frequencies
#.PRECIOUS: genmodel/pw.%.tally
#genmodel/pw.%.tally: genmodel/%.cnftrees  scripts/ipModel.pl
#	cat  genmodel/$*.cnftrees | perl -p -e 's/\([^ ]* //g;s/\)//g;s/ /\n/g' | perl scripts/ipModel.pl | sed 's/\(.*\)\#\(.*\)/Pw \2 : \1/' | perl -e 'while(<>){chop;$$T{$$_}++;} foreach $$t(keys %T){print "$$t = $$T{$$t}\n";}' > $@
#### part-of-speech model
#.PRECIOUS: genmodel/POS.%.model
#%.pos.model:  %.pcfg.counts  bin/postrainer  scripts/relfreq.pl
#	cat  $< | sed 's/:[^ ][^ ]*//;s/^Pw *\([^ ]*\) *: *\([^ ]*\)/PwDT \1 : \2/' | bin/postrainer | grep -v "WARNING: distribution Y_prior: no values" > $@
#	cat  genmodel/pw.$*.tally | sed 's/:[^ ][^ ]*//;s/^Pw *\([^ ]*\) *: *\([^ ]*\)/P : \2/' | perl scripts/relfreq.pl >> $@
#### part-of-speech model
# NOTE(review): this .PRECIOUS pattern (genmodel/POS.%.model) does not match
# the rule target pattern below (%.pos.model) -- confirm which is intended.
.PRECIOUS: genmodel/POS.%.model
%.pos.model:  %.pcfg.counts  bin/postrainer  scripts/relfreq.pl
	cat  $< | sed 's/^Pw/PwDT/' | bin/postrainer > $@

#### gf model
# Grammatical-function HHMM model: derived from pcfg counts, with the POS
# model appended at the end.
.PRECIOUS: %.gf-hhmm.model
%.gf-hhmm.model:  %.pcfg.counts  scripts/calc-gf-hhmm.py  scripts/sortbyprob.pl  %.pos.model
	cat $<  |  python $(word 2,$^)  |  perl $(word 3,$^)  >  $@
	cat $(word 4,$^)  >>  $@
#### variable gf model
.PRECIOUS: %.afg-hhmm.model
%.afg-hhmm.model:  %.v-pcfg.counts  scripts/get-afg.py  scripts/sortbyprob.pl
	cat $<  |  python $(word 2,$^)  |  perl $(word 3,$^)  >  $@


#### clustering model parameters -- filename `words' delimited by commas (e.g. wsjTRAIN-pu-hw1000-cc3,10,10,1,20,3.model)
# 1st word:  training corpus                          (e.g. wsjTRAIN-pu-hw1000-cc3)
# 2nd word:  for EM: number of EM clusters            (e.g. 10)
# 3rd word:  for EM: number of EM iterations          (e.g. 10)
# 4th word:  for EM: random seed for init values      (e.g. 1)
# 5th word:  for RC-trans: number of value iterations (e.g. 20)
# 6th word:  for POS model: min number of instances   (e.g. 3)
#### obtain initial random model for relational clustering, given depth-annotated cnftrees and params for num clusters (comma word 2) and random seed (comma word 4)
# NOTE(review): $(ID_OR_H) is not defined in this file -- presumably set on
# the command line or in an included subproject makefile; verify.
genmodel/md-rlnclust.%.model.insouts.first:  genmodel/$$(word 1,$$(subst $$(comma), ,$$*)).dcnftrees  bin/init-rand-svs-model
	cat  $(word 1,$^)  |  $(word 2,$^)  $(word 2,$(subst $(comma), ,$*))  $(word 4,$(subst $(comma), ,$*)) $(ID_OR_H) >  $@
#### obtain relationally clustered model using inside-outside, given initial random model, depth-annotated cnftrees, params for num clusters (comma word 2), num EM iters (comma word 3)
.PRECIOUS: genmodel/md-rlnclust.%.model.insouts
genmodel/md-rlnclust.%.model.insouts:  $$@.first  genmodel/$$(word 1,$$(subst $$(comma), ,$$*)).dcnftrees  bin/inside-outside  scripts/sortbyprob.pl
	@echo "WARNING: long build for '$@'!  Press CTRL-C to abort!"
	@sleep 5
	cat  $(word 1,$^)  $(word 2,$^)  |  $(word 3,$^)  $(word 2,$(subst $(comma), ,$*))  $(word 3,$(subst $(comma), ,$*))  |  perl  $(word 4,$^)  >  $@
#### obtain log file (equiv to nohup.out??) given initial random model, depth-annotated cnftrees, params for num clusters (comma word 2), num EM iters (comma word 3)
genmodel/md-rlnclust.%.model.insouts.log: $$@.first  genmodel/$$(word 1,$$(subst $$(comma), ,$$*)).dcnftrees  bin/init-rand-svs-model  bin/inside-outside
	cat  $(word 2,$^)  |  $(word 3,$^)  $(word 2,$(subst $(comma), ,$*))  $(word 4,$(subst $(comma), ,$*))  >  $@.first     ## NOTE: should use sortbyprob?
	cat  $(word 1,$^)  $(word 2,$^)  |  $(word 4,$^)  $(word 2,$(subst $(comma), ,$*))  $(word 3,$(subst $(comma), ,$*))  >&  $@
	rm -f $@.first
#### obtain factored syn-sem pcfg model given relationally clustered model, depth-annotated cnftrees, param for min count cutoff (comma word 6)
.PRECIOUS: genmodel/md-rlnclust.%.model
genmodel/md-rlnclust.%.model: genmodel/md-rlnclust.%.model.insouts  genmodel/$$(word 1,$$(subst $$(comma), ,$$*)).dcnftrees  scripts/relfreq.pl
	cat  $(word 1,$^)  |  sed 's/A : /Gr : l 1 /'  >  $@       ## (need this for md parser, but undo this sed for vecmd)
	cat  $(word 2,$^)  |  grep -o '[^ ]* [^ ]*#[^ )]*)'  |  sed 's/([^ ]*:\([^ ]*\){[^ ]* \([^ ]*\)#\([^ ]*\))/Pw \3 : \2 = 1\nPc \1 : \2 = 1\nP : \2 = 1/'  |  perl $(word 3,$^)  |  sort >> $@
	cat  $@  |  grep '^Pw '  |  sed 's/^Pw  *\([^ ]*\).*/W : \1/'  |  perl $(word 3,$^)  -c $(word 6,$(subst $(comma), ,$*))  |  sort  >>  $@
#### vectorized factored syn-sem pcfg model
# Builds into $@.tmp first, prepends an M_SIZE line counting distinct M rows,
# then assembles $@ and removes the temp file.
.PRECIOUS: genmodel/vec%.model
genmodel/vec%.model: genmodel/%.model  genmodel/$$(word 1,$$(subst $$(comma), ,$$(word 2,$$(subst ., ,$$@)))).dcnftrees  bin/get-a-transp
	cat  $(word 1,$^)  $(word 2,$^)  |  sed 's/Gr : l 1 /A : /'  |  grep -v '^[CGPW]'  |  $(word 3,$^)  $(word 2,$(subst $(comma), ,$*))  |  sed 's/^G  *: \(.*\){\(.*\)}/G \1 unk : \2/g;'  |  sort  >  $@.tmp
	cat  $(word 1,$^)  |  sed 's/ROOT{Unk}/ROOT{unk}/g;s/^Gr  *: \(.*\){\(.*\)}/Gr \1 unk : \2/g;s/^M \(.*\){\(.*\)} : \(.*\) =/M \1 \3 \2 : \2 =/g'  >>  $@.tmp
	echo 'M dummy' >> $@.tmp
	cat $@.tmp | grep ^M | awk '{print $$7}' | sort -u | wc -l | sed 's/^ *\([0-9]*\).*/M_SIZE \1/g' > $@
	cat $@.tmp >> $@
	rm -f $@.tmp
#### obtain incremental factored syn-sem hhmm model (intermediate left-progeny: "G*,G+,G0" terms) given params for num value iterations (comma word 5)
genmodel/mlstar.%.model:  genmodel/md-rlnclust.%.model  bin/mlmodel2mlstarmodel
	@echo "WARNING: long build for '$@'!  Press CTRL-C to abort!"
	@sleep 5
	cat  $(word 1,$^)  |  grep -v ' = 0$$'  |  sed 's/Gr : l 1/Gr :/'  |  $(word 2,$^)  $(word 5,$(subst $(comma), ,$*))  >  $@
#### incremental factored syn-sem hhmm model
# FIX: was ".PRECOUS" -- not a valid special target, so make treated it as an
# ordinary rule and the model file was never actually marked precious.
.PRECIOUS: genmodel/%.gif-hhmm.model
genmodel/%.gif-hhmm.model:  genmodel/mlstar.%.model  genmodel/md-rlnclust.%.model  bin/mlstarmodel2gifmodel  scripts/sortbyprob.pl  genmodel/POS.$$(word 1,$$(subst -cc, ,$$*)).model
	cat  $(word 1,$^)  |  $(word 3,$^)  |  perl  $(word 4,$^)  >  $@
	cat  $(word 2,$^)  | grep '^[HP]'  |  perl  $(word 4,$^)  |  sort  |  perl $(word 4,$^)  >>  $@
	cat  $(word 5,$^)  >>  $@


.PRECIOUS: genmodel/gf-t2m.%.model
# Recursive invocations use $(MAKE) (not literal "make") so the jobserver,
# -j, and -n flags propagate correctly.
# NOTE(review): COUNTCUTOFF is not defined in this file -- presumably set by
# an included subproject makefile or on the command line; verify.
genmodel/gf-t2m.%.model:
	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
	@sleep 5
	$(MAKE) genmodel/$*.crctrees                                                 scripts/trees2dat-gf.rb        scripts/relfreq.pl        scripts/sortbyprob.pl
	cat  genmodel/$*.crctrees | grep -v '\^[5-9]' | sed 's/\^[0-9]//g' | ruby scripts/trees2dat-gf.rb | perl scripts/relfreq.pl | perl scripts/sortbyprob.pl > $@
	$(MAKE) genmodel/pw.$*.tally
	cat  genmodel/pw.$*.tally | perl scripts/relfreq.pl -c $(COUNTCUTOFF) | perl -n -e 'if(s/^Pw ([^ ]+).*/W : \1 = 1.0/){ print }' | sort -u >> $@
	cat  genmodel/pw.$*.tally | perl scripts/relfreq.pl -c $(COUNTCUTOFF) | perl scripts/sortbyprob.pl >> $@

#.PRECIOUS: genmodel/gfbar.%.model
#genmodel/gfbar.%.model:
#	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
#	@sleep 5
#	make genmodel/pcfg.$*.model   bin/pcfg2gfbar        scripts/sortbyprob.pl
#	cat  genmodel/pcfg.$*.model | bin/pcfg2gfbar | perl scripts/sortbyprob.pl > $@    # | sort -T /scratch/nlp > $@
.PRECIOUS: genmodel/gf.%.model
# Recursive invocation uses $(MAKE) so -j/-n and the jobserver propagate.
genmodel/gf.%.model:
	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
	@sleep 5
	$(MAKE) genmodel/pcfg.$*.model   bin/pcfg2gf        scripts/sortbyprob.pl
	cat  genmodel/pcfg.$*.model | grep -v '^G ' | sed 's/^Pc /Pg /g' |  bin/pcfg2gf | perl scripts/sortbyprob.pl > $@    # | sort -T /scratch/nlp > $@
#	make genmodel/RCPCFG.$*.model   bin/rcpcfg2hhmm        scripts/sortbyprob.pl
#	cat  genmodel/RCPCFG.$*.model | bin/rcpcfg2hhmm | perl scripts/sortbyprob.pl > $@    # | sort -T /scratch/nlp > $@

.PRECIOUS: genmodel/trace.%.model
# Recursive invocation uses $(MAKE) so -j/-n and the jobserver propagate.
genmodel/trace.%.model:
	$(MAKE) genmodel/gf-t2m.$*.model genmodel/$*.cnftrees genmodel/$*.dps
	cat genmodel/gf-t2m.$*.model | grep -v "^W " > $@
	cat genmodel/pw.$*.tally | sed 's/^Pw  *\([^ ]*\).*: [^ ]* = \([^ ]*\)/W : \1 = \2/' | perl scripts/relfreq.pl -c 10 >> $@
	cat genmodel/$*.dps | ruby scripts/alignRepairs.rb >> $@

.PRECIOUS: genmodel/fp.%.model
# Recursive invocation uses $(MAKE) so -j/-n and the jobserver propagate.
genmodel/fp.%.model:
	$(MAKE) genmodel/gf-t2m.$*.model
	cp genmodel/gf-t2m.$*.model $@

############# Elision model
# Unfinished-constituent examples from swbd training sections.
genmodel/unfs.txt: genmodel/swbd2.trees genmodel/swbd3.trees scripts/generateUnfs.rb
	cat genmodel/swbd2.trees genmodel/swbd3.trees | ruby scripts/generateUnfs.rb | grep -v "EDITED" | grep -v "UNF" > $@

# (the "scripts//" double slash below is harmless to the shell)
genmodel/elisionmodel.txt: genmodel/unfs.txt genmodel/swbd2.trees genmodel/swbd3.trees scripts/buildElisionModel.rb scripts/relfreq.pl scripts/limitProb.rb
	cat $< genmodel/swbd[23].trees | ruby scripts//buildElisionModel.rb | grep -v "EDITED" | grep -v "UNF" | perl scripts/relfreq.pl | ruby scripts/limitProb.rb | sort > $@


################################################################################
#
#  Parser execution
#
################################################################################

#### obtain input sentences
#%.sents: %.trees
#	cat $< | sed 's/([^ ]* //g;s/)//g' | sed 's/  */ /g' > $@
# Strip bracketing (and cat#word prefixes) to recover plain sentences; the
# two rules differ only in which tree format they start from.
%.sents: %.gold.evalform
	cat $< | sed 's/([^ ]* //g;s/)//g;s/[^ \/]*\#//g' > $@
%.sents: %.cnftrees
	cat $< | sed 's/[^ \#()]*\#/\#/g;s/([^ ]* //g;s/) *//g;s/\#/ /g' > $@

#### obtain model-specific parser output by running sentences through parser given flags and model:
# parser output files -- filename `words' delimited by dots (e.g. wsj22-first393-pu.wsjTRAIN-pu-nr.-b_500.gf-hhmm.output)
# 1st word:  test set + params delimited by dashes     (e.g. wsj22-first393-pu or wsj22-np)
# 2nd word:  training set + params delimited by dashes (e.g. wsjTRAIN-pu-nr or wsjTRAIN-np-hw1000-cc10,10,10,1,20,3 or swbdTRAIN-...)
# 3rd word:  runtime params delimited by underscores   (e.g. -b_500)
# 4th word:  model / output format                     (e.g. gf-hhmm or gif-hhmm or gg-cky)
.PRECIOUS: %.output
%.output:  genmodel/$$(word 1,$$(subst ., ,$$@)).sents  bin/parser-$$(word 4,$$(subst ., ,$$@))  genmodel/$$(word 2,$$(subst ., ,$$@)).$$(word 4,$$(subst ., ,$$@)).model
	@echo "WARNING: long build for '$@'!  Press CTRL-C to abort!"
	@sleep 5
	cat $< | $(word 2,$^) $(subst _, ,$(word 3,$(subst ., ,$*))) $(word 3,$^) > $@

#### obtain cnftrees by converting output using script:
#%.hypoth.cnftrees:  %.output  $$(word 4,$$(subst ., ,$$@))-output-to-cnftrees.pl
#	cat $< | $(word 2,$^) > $@
# Convert parser output back to cnftrees; the hhmm and cky variants run the
# identical pipeline, differing only in which .output file they consume.
%-hhmm.hypoth.cnftrees:  %-hhmm.output  scripts/hypoths2rctrees.pl  scripts/rctrees2flattrees.pl  scripts/flattrees2cnftrees.pl
	cat $< | perl $(word 2,$^) | sed 's/{\(e[0-9]*\)}/.\1/g' | perl $(word 3,$^) | perl $(word 4,$^) > $@
%-cky.hypoth.cnftrees:  %-cky.output  scripts/hypoths2rctrees.pl  scripts/rctrees2flattrees.pl  scripts/flattrees2cnftrees.pl
	cat $< | perl $(word 2,$^) | sed 's/{\(e[0-9]*\)}/.\1/g' | perl $(word 3,$^) | perl $(word 4,$^) > $@
# CODE REVIEW: sed just needed b/c perl sandwitch scripts use braces for something else; this should be eliminated





##### run cky parser (obtain parser output file):
## parser output files -- filename `words' delimited by dots (e.g. hhmm.qf.wsjpu.feb27.500.wsj22pu.eval)
## 1st word:  parse or evaluation framework                            (e.g. hhmm [which produces mls output] or cky)
## 2nd word:  implementation + runtime params delimited by underscores (e.g. vecmd-rlnclust)
## 3rd word:  training set                                             (e.g. wsjpu or wsjnp or swbd)
## 4th word:  eval date/id                                             (e.g. feb27 or feb27b)
## pre-final word:  test set        (e.g. wsj22pu or wsj22np)
#.PRECIOUS: cky.%.hypoth.cnftrees
#cky.%.hypoth.cnftrees: 
#	make cky.$*.sents
#	make genmodel/$(word 2,$(subst ., ,$@)).$(word 3,$(subst ., ,$@)).model
#	make genmodel/POS.$(word 3,$(subst ., ,$@)).model
#	make bin/ckyparser-$(word 2,$(subst ., ,$@))
#	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
#	@sleep 5
#	cat cky.$*.sents | nice bin/ckyparser-$(word 2,$(subst ., ,$@)) genmodel/$(word 2,$(subst ., ,$@)).$(word 3,$(subst ., ,$@)).model genmodel/POS.$(word 3,$(subst ., ,$@)).model > $@




#%.possents: %.gold.evalform
#	cat $< | perl -pe "s/[()]//g;s/.*?([^ ]+) ([a-z'-][^\s]*)/\2\/\1 /g" | perl -pe "s/^ *//g" | perl -pe 's/ *$$//g' > $@

## REDUNDANT with generalized parser item!
##.PRECIOUS: cky.pcfg.%.hypoth.cnftrees
##cky.pcfg.%.hypoth.cnftrees:
##	make cky.pcfg.$*.sents
##	make genmodel/pcfg.$(word 3,$(subst ., ,$@)).model
##	make genmodel/POS.$(word 3,$(subst ., ,$@)).model
##	make bin/ckyparser
##	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
##	@sleep 5
##	cat cky.pcfg.$*.sents | nice bin/ckyparser genmodel/pcfg.$(word 3,$(subst ., ,$@)).model genmodel/POS.$(word 3,$(subst ., ,$@)).model > $@
#.PRECIOUS: cky.ml%.hypoth.cnftrees
#cky.ml%.hypoth.cnftrees:
#	make cky.ml$*.sents
#	make genmodel/$(word 2,$(subst ., ,$@)).$(word 3,$(subst ., ,$@)).model
#	make genmodel/POS.$(word 3,$(subst ., ,$@)).model
#	make bin/ckyparser-ml
#	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
#	@sleep 5
#	cat cky.ml$*.sents | nice bin/ckyparser-ml genmodel/$(word 2,$(subst ., ,$@)).$(word 3,$(subst ., ,$@)).$(word 5,$(subst ., ,$@)).model genmodel/POS.$(word 3,$(subst ., ,$@)).model > $(subst cnftrees,dat,$@)
#	rmel $(subst cnftrees,dat,$@) > $@
#.PRECIOUS: cky.vecmd%.hypoth.cnftrees
#cky.vecmd%.hypoth.cnftrees:
#	make genmodel/$(word 2,$(subst ., ,$@)).$(word 3,$(subst ., ,$@)).$(word 5,$(subst ., ,$@)).model
#	make genmodel/POS.$(word 3,$(subst ., ,$@)).model
#	make bin/ckyparser-$(word 2,$(subst ., ,$@))
#	make cky.vecmd$*.sents
#	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
#	@sleep 5
#	cat cky.vecmd$*.sents | nice bin/ckyparser-$(word 2,$(subst ., ,$@)) genmodel/$(word 2,$(subst ., ,$@)).$(word 3,$(subst ., ,$@)).$(word 5,$(subst ., ,$@)).model genmodel/POS.$(word 3,$(subst ., ,$@)).model | sed 's/{[^; ]*\([; ]\)/\1/g;s/[hmU-][^: ;]*://g' > $@
#.PRECIOUS: cky.gf.%.hypoth.cnftrees
#cky.gf.%.hypoth.cnftrees:
#	make cky.gf.$*.sents
#	make genmodel/$(word 2,$(subst ., ,$@)).$(word 3,$(subst ., ,$@)).model
#	make genmodel/POS.$(word 3,$(subst ., ,$@)).model
#	make bin/ckyparser
#	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
#	@sleep 5
#	cat cky.gf.$*.sents | nice bin/ckyparser genmodel/$(word 2,$(subst ., ,$@)).$(word 3,$(subst ., ,$@)).model genmodel/POS.$(word 3,$(subst ., ,$@)).model > $@
#.PRECIOUS: hhmm.qfbar.%.hypoth.dat
#hhmm.qfbar.%.hypoth.dat:
#	make hhmm.qfbar.$*.sents
#	make genmodel/QF.$(word 3,$(subst ., ,$@)).model
#	make genmodel/POS.$(word 3,$(subst ., ,$@)).model
#	make bin/hhmmparser-qfbar
#	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
#	@sleep 5
#	cat hhmm.qfbar.$*.sents | nice bin/hhmmparser-qfbar -b $(word 5,$(subst ., ,$@)) genmodel/QF.$(word 3,$(subst ., ,$@)).model genmodel/POS.$(word 3,$(subst ., ,$@)).model > $@


# Switchboard-dev: turn hypothesis cnftrees into evalb-ready evalform.
# Fixed: the recipe invokes scripts/uppercasepreterm.rb and
# scripts/unglomIntj.rb, which were not listed as prerequisites, so editing
# those scripts did not trigger a rebuild.
%.swbd4dev.hypoth.evalform: %.swbd4dev.hypoth.cnftrees scripts/unbinarize.pl scripts/unbuildModel.rb scripts/uppercasepreterm.rb scripts/unglomIntj.rb
	cat $< | ruby scripts/uppercasepreterm.rb | sed 's/[^ \/]*\#//g' | perl scripts/unbinarize.pl | perl -p -e 's/(\([A-Z\$$\.\,\!\`\'\'']+)[-a-z\$$]+[-a-zA-Z0-9\$$]*/\1/g' | ruby scripts/unbuildModel.rb -e | sed 's/EDITED[^ ]*/EDITED/g' | perl -p -e 's/([^-])UNF/\1-UNF/g' | perl -pe 's/\(VP([^ ]+) ([^(]+)\)/(VP (\1 \2))/g' | perl -p -e 's/\((WH)?NP([^ ]*) ([^(]+)\)/(\1NP (prp \3))/g' | perl -p -e 's/\(PRT ([^(]+)\)/(PRT (pos \1))/g' | perl -p -e 's/\(INTJ ([^(]+)\)/(INTJ (UH \1))/g' | perl -p -e 's/\(ADVP ([^(]+)\)/(ADVP (RB \1))/g' | perl -p -e 's/\(ADJP ([^(]+)\)/(ADJP (JJ \1))/g' | perl -p -e 's/[A-Z]+\)/)/g' | perl -p -e 's/EDITED ([^(]+)\)/EDITED (pos \1))/' | ruby scripts/unglomIntj.rb > $@
#'

# Switchboard-test: identical pipeline to the swbd4dev rule.
# Fixed: scripts/uppercasepreterm.rb and scripts/unglomIntj.rb are used by the
# recipe but were missing from the prerequisite list.
%.swbd4test.hypoth.evalform: %.swbd4test.hypoth.cnftrees scripts/unbinarize.pl scripts/unbuildModel.rb scripts/uppercasepreterm.rb scripts/unglomIntj.rb
	cat $< | ruby scripts/uppercasepreterm.rb | sed 's/[^ \/]*\#//g' | perl scripts/unbinarize.pl | perl -p -e 's/(\([A-Z\$$\.\,\!\`\'\'']+)[-a-z\$$]+[-a-zA-Z0-9\$$]*/\1/g' | ruby scripts/unbuildModel.rb -e | sed 's/EDITED[^ ]*/EDITED/g' | perl -p -e 's/([^-])UNF/\1-UNF/g' | perl -pe 's/\(VP([^ ]+) ([^(]+)\)/(VP (\1 \2))/g' | perl -p -e 's/\((WH)?NP([^ ]*) ([^(]+)\)/(\1NP (prp \3))/g' | perl -p -e 's/\(PRT ([^(]+)\)/(PRT (pos \1))/g' | perl -p -e 's/\(INTJ ([^(]+)\)/(INTJ (UH \1))/g' | perl -p -e 's/\(ADVP ([^(]+)\)/(ADVP (RB \1))/g' | perl -p -e 's/\(ADJP ([^(]+)\)/(ADJP (JJ \1))/g' | perl -p -e 's/[A-Z]+\)/)/g' | perl -p -e 's/EDITED ([^(]+)\)/EDITED (pos \1))/' | ruby scripts/unglomIntj.rb > $@
#'

# Stanford-parser baseline on right-corner trees: build inputs, train, parse,
# then convert the output back through the flat/CNF tree scripts.
# Fixed: recursive invocations must use $(MAKE), not literal `make', so that
# -j (jobserver), -n, and command-line overrides propagate.
sp.rct.%.hypoth.cnftrees:
	$(MAKE) sp.$*.possents
	$(MAKE) genmodel/$(word 3,$(subst ., ,$@)).sptrees
	java -server -mx1g -cp stanford-tools.jar edu.stanford.nlp.parser.lexparser.LexicalizedParser -PCFG -vMarkov 1 -uwm 0 -headFinder edu.stanford.nlp.trees.LeftHeadFinder -train genmodel/$(word 3,$(subst ., ,$@)).sptrees -tokenized -tagseparator / -sentences newline -outputFormat oneline sp.$*.possents | perl -pe 's/UNDERSCORE/_/g' | perl scripts/rctrees2flattrees.pl | perl scripts/flattrees2cnftrees.pl > $@

# Stanford-parser baseline on lexicalized trees, emitting evalform directly.
# Fixed: recursive invocations must use $(MAKE), not literal `make', so that
# -j (jobserver), -n, and command-line overrides propagate.
sp.cnf.%.hypoth.evalform:
	$(MAKE) sp.$*.possents
	$(MAKE) genmodel/$(word 3,$(subst ., ,$@)).lextrees
	java -server -mx1g -cp stanford-tools.jar edu.stanford.nlp.parser.lexparser.LexicalizedParser -PCFG -vMarkov 1 -uwm 0 -headFinder edu.stanford.nlp.trees.LeftHeadFinder -train genmodel/$(word 3,$(subst ., ,$@)).lextrees -tokenized -tagseparator / -sentences newline -outputFormat oneline sp.$*.possents > $@



#trellview: bin/trellview genmodel/CF.model genmodel/POS.model
#	$^

#trellview-nobar: bin/trellview-nobar genmodel/CF.model genmodel/POS.model
#	$^


################################################################################
#
#  Evaluation
#
################################################################################


#### turn cnftrees into more standard, flatter parse format
.PRECIOUS: %.evalform
# Flatten cnftrees into a more standard evalb-style bracketing:
#   sed #1: strip word-internal material ending in `#'
#   sed #2: strip `:'-suffixed decorations and .e<digit> indices
#   unbinarize.pl ($(word 2,$^)): undo the CNF binarization
#   final perl: strip lowercase sub-category suffixes from nonterminal labels
#               (the doubled '' inside the quoted program is single-quote trickery)
%.evalform: %.cnftrees scripts/unbinarize.pl
	cat $< | sed 's/[^ \/]*\#//g' | sed 's/[^\(\) ]*://g;s/\.e[0-9]//g' | perl $(word 2,$^) | perl -p -e 's/(\([A-Z\$$\.\,\!\`\'\'']+)[-a-z\$$]+[-a-zA-Z0-9\$$]*/\1/g' > $@
#' # need this to view rest of make correctly following single-quote trickery above


#### obtain eval by running evaluator on gold and hypoth trees
# eval files same convention as parser output -- filename `words' delimited by dots (e.g. wsj22-first393-pu.wsjTRAIN-pu-nr.-b_500.gf-hhmm.eval)
# 1st word:  test set + params delimited by dashes     (e.g. wsj22-pu or wsj22-np)
# 2nd word:  training set + params delimited by dashes (e.g. wsjTRAIN-pu-nr or wsjTRAIN-np-hw1000-cc10,10,10,1,20,3 or swbdTRAIN-...)
# 3rd word:  runtime params delimited by underscores   (e.g. -b_500)
# 4th word:  model / output format                     (e.g. gf-hhmm or gif-hhmm or gg-cky)
.PRECIOUS: %.eval
# Run the evalb scorer on gold vs. hypothesis evalform trees.
# Secondary expansion derives the gold file from word 1 (the test set) of the
# dotted target name.  Positional prerequisites:
#   $(word 2,$^) = bin/evalb        $(word 3,$^) = srcmodel/new.prm
#   $(word 4,$^) = gold evalform    $(word 5,$^) = hypothesis evalform
# NOTE(review): a second, older %.eval pattern rule exists further down in this
# file with different prerequisites; make picks whichever rule's prerequisites
# it can satisfy first -- consider removing one of the two.
%.eval:  user-subproject-includes.txt  bin/evalb  srcmodel/new.prm  genmodel/$$(word 1,$$(subst ., ,%)).evalform  %.hypoth.evalform
	$(word 2,$^) -p $(word 3,$^) $(word 4,$^) $(word 5,$^) > $@


### hhmm.hypoth.evalform
#hhmm%.hypoth.evalform: hhmm%.hypoths scripts/hypoths2rctrees.pl scripts/rctrees2flattrees.pl scripts/flattrees2cnftrees.pl scripts/unbinarize.pl
#	cat $< | perl scripts/hypoths2rctrees.pl | perl scripts/rctrees2flattrees.pl | perl scripts/flattrees2cnftrees.pl | sed 's/[^ \/]*\#//g' | perl scripts/unbinarize.pl | sed 's/\([A-Z]\+\)[-a-z\$$]\+[-a-zA-Z\$$]*/\1/g' > $@

### ckyparser.hypoth.evalform
#ckyparser%.hypoth.evalform: ckyparser%.hypoth.cnftrees scripts/unbinarize.pl
#	cat $< | sed 's/[^ \/]*\#//g' | perl scripts/unbinarize.pl | sed 's/\([A-Z]\+\)[-a-z\$$]\+[-a-zA-Z\$$]*/\1/g' > $@


#### gold standard evalform
#%.swbd4test.gold.evalform: genmodel/swbd4.trees  #### ????????????
#	head -6051 $^ | perl -pe 's/\)\((.*)$$/)\n(\1/' | perl -p -e 's/\(S1 (.*)\)/\1/' > $@
#%.swbd4dev.gold.evalform: genmodel/swbd4.trees
#	tail -300 $^ | perl -p -e 's/\(S1 (.*)\)/\1/' > $@
#%.brown.gold.evalform: genmodel/all.brown.cnftrees
#	tail -300 $^ | perl -p -e 's/\(S1 (.*)\)/\1/' > $@
##%.wsj22pu.gold.evalform: genmodel/wsj22pu.cnftrees
##	cat $^ | head -393 | sed 's/[^ \/]*\#//g' | perl scripts/unbinarize.pl | perl -p -e 's/([A-Z]+)[-a-z\$$]+[-a-zA-Z\$$]*/\1/g' > $@
##%.wsj22np.gold.evalform: genmodel/wsj22np.cnftrees
##	cat $^ | head -393 | sed 's/[^ \/]*\#//g' | perl scripts/unbinarize.pl | perl -p -e 's/([A-Z]+)[-a-z\$$]+[-a-zA-Z\$$]*/\1/g' > $@
##%.gold.evalform: %.cnftrees scripts/unbinarize.pl
##	cat $^ | sed 's/[^ \/]*\#//g' | perl scripts/unbinarize.pl | perl -p -e 's/([A-Z]+)[-a-z\$$]+[-a-zA-Z\$$]*/\1/g' > $@


#### {hhmmparser...,ckyparser...}.eval
# Legacy evalb rule: gold evalform comes from the first dot-word of the stem.
# NOTE(review): duplicates the generalized %.eval rule above (same target
# pattern, different prerequisite layout); only one of the two rules will ever
# fire for a given target -- consider consolidating.
%.eval: bin/evalb srcmodel/new.prm $$(word 1,$$(subst ., ,%)).gold.evalform %.hypoth.evalform
	bin/evalb -p srcmodel/new.prm $*.gold.evalform $*.hypoth.evalform > $@

# Mark EDITED (disfluency) regions in an evalform file.
%.edited: %.evalform
	ruby scripts/markEdited.rb < $< > $@

# Switchboard evaluation: take the plain evalb report, then append
# edited-region (disfluency) scoring computed from the marked-up trees.
# Fixed: the intermediate files used fixed names `eraseme'/`eraseme2', which
# race under `make -j' and litter the working directory; names are now
# derived from $@ and removed afterwards.
# NOTE(review): `mv $<' consumes the %.eval prerequisite, so the eval is
# regenerated on every invocation -- confirm this is intended.
%.swbdeval: %.eval scripts/evalEdit.rb scripts/markEdited.rb %.hypoth.evalform %.gold.evalform
	mv $< $@
	cat $*.gold.evalform | ruby scripts/markEdited.rb > $@.gold.tmp
	cat $*.hypoth.evalform | ruby scripts/markEdited.rb > $@.hyp.tmp
	ruby scripts/evalEdit.rb $@.gold.tmp $@.hyp.tmp >> $@
	rm -f $@.gold.tmp $@.hyp.tmp

### scores
# Extract per-sentence scores from an evalb report: drop the "2 or" summary
# lines and deeply-indented lines, keep table rows, and for sentences with
# field 2 (length) <= 40 print "<field1> <field4>".
# NOTE(review): the exact meaning of $$F[3] depends on evalb's column layout
# -- confirm it is the intended score column.  The commented variants below
# compute F1 instead.
%.scores: %.eval
	cat $< | grep -v '^2 or' | grep -v '^    ' | grep '^[ 0-9]' | perl -na -e 'if ($$F[1]<=40) {print "$$F[0] $$F[3]\n";}' > $@
#	cat $< | grep -v '^2 or' | grep -v '^    ' | grep '^  *[0-9]' | perl -na -e 'if ($$F[1]>40) {$$f = ($$F[3]+$$F[4]==0) ? 0.0 : 2.0*$$F[3]*$$F[4]/($$F[3]+$$F[4]); print "$$F[0] $$f\n";}' > $@
#	cat $< | grep -v '^2 or' | grep -v '^    ' | grep '^[ 0-9]' | perl -na -e 'if ($$F[1]<=40) {$$f = ($$F[3]+$$F[4]==0) ? 0.0 : $$F[1]*2.0*$$F[3]*$$F[4]/($$F[3]+$$F[4]); print "$$F[0] $$f\n";}' > $@


### crctrees eval
#%.pcfg.model: %.cnftrees scripts/trees2rules.pl | scripts/relfreq.pl
#	cat $< genmodel/eos.cnftrees | perl scripts/trees2rules.pl | perl scripts/relfreq.pl | sort > $@
#%.rcpcfg.model: %.pcfg.model bin/pcfg2rcpcfg
#	cat $< | bin/pcfg2rcpcfg | sort > $@
#%.hhmm.model: %.rcpcfg.model bin/rcpcfg2hhmm
#	cat $< | bin/rcpcfg2hhmm | perl scripts/sortbyprob.pl > $@  #| sort -T /scratch/nlp > $@


### random trees from pcfg
# Sample random trees from a PCFG model; keep the result even when it is an
# intermediate of a longer chain.
.PRECIOUS: genmodel/rand.pcfg.%.cnftrees
genmodel/rand.pcfg.%.cnftrees: genmodel/pcfg.%.model bin/pcfg2randtree
	$(word 2,$^) < $< > $@

### depth stats
# Depth statistics: count words per depth, drop the AVGWDS summary line,
# take the depth column, and tabulate frequencies with sort|uniq -c.
.PRECIOUS: %.depths
%.depths: %.crctrees
	ruby scripts/countWords.rb < $< | grep -v AVGWDS | cut -d ' ' -f 2 | sort | uniq -c > $@

### eps graph of depths
# Render %.depths as an EPS plot; the plotting script receives the file stem.
%.depths.eps: scripts/plotDepths.sh %.depths
	$< $*

### complexity metric calculation: surprisal, entropy reduction, average depth
.PRECIOUS: test.%.gf.des.out
# Complexity metrics CSV (surprisal / entropy reduction / depth) from raw
# sentences, via the gf hhmm parser with hard-coded wsjnp models.
# Fixed: the intermediate parser output used the fixed name `tmp', which races
# under `make -j' and is never cleaned up; it is now derived from $@.
%.gf.des.csv: %.sent bin/hhmmparser-gf
	cat $< | bin/hhmmparser-gf -des genmodel/gf-t2m.wsjnp.model genmodel/POS.wsjnp.model > $@.tmp
	cat $@.tmp | ruby scripts/noisyout2surprisal.rb -des -1 > $@
	rm -f $@.tmp
# Raw surprisal-mode parser output for a test set.
test.%.gf.s.out: test.%.sent bin/hhmmparser-gf genmodel/gf.%.model
	cat $< | bin/hhmmparser-gf -s genmodel/gf.$*.model > $@
# Surprisal CSV from surprisal-mode output.
test.%.gf.s.csv: test.%.gf.s.out scripts/noisyout2surprisal.rb
	cat $< | ruby scripts/noisyout2surprisal.rb -s -1 > $@
# Per-word surprisal extracted from combined -des output.
test.%.gf.surprisal: test.%.gf.des.out scripts/noisyout2surprisal.rb
	cat $< | ruby scripts/noisyout2surprisal.rb -s > $@
# Per-word entropy reduction extracted from combined -des output.
test.%.gf.entropyrdc: test.%.gf.des.out scripts/noisyout2surprisal.rb
	cat $< | ruby scripts/noisyout2surprisal.rb -e > $@
# Per-word average depth extracted from combined -des output.
test.%.gf.depth: test.%.gf.des.out scripts/noisyout2surprisal.rb
	cat $< | ruby scripts/noisyout2surprisal.rb -d > $@
# Combined depth/entropy/surprisal parser output (kept via .PRECIOUS above).
test.%.gf.des.out: test.%.sent bin/hhmmparser-gf genmodel/gf.%.model
	cat $< | bin/hhmmparser-gf -des genmodel/gf.$*.model > $@
# NOTE(review): this rule reads the .s.out file yet passes -des1 to the
# converter; the sibling %.gf.des.csv rule reads -des output with `-des -1'.
# Looks like a copy-paste slip (prereq and/or flag) -- confirm before relying
# on test.%.gf.des.csv.
test.%.gf.des.csv: test.%.gf.s.out scripts/noisyout2surprisal.rb
	cat $< | ruby scripts/noisyout2surprisal.rb -des1 > $@


################################################################################
#
#  Misc utilities
#
################################################################################

# Convenience search: `make grep.PATTERN' greps the C++ sources; dots in
# PATTERN become spaces so a dotted target can carry a multi-token pattern.
# Fixed: $(subst '.',' ',$*) substituted the literal three-character string
# quote-dot-quote -- make's $(subst) takes its arguments unquoted, matching
# the $(subst ., ,...) idiom used throughout this file.
grep.%:
	grep $(subst ., ,$*) src/*.cpp include/*.h ../rvtl/include/*.h -n

# Heap-profile a run-* program with valgrind massif (instruction-count time
# unit, up to 500 snapshots); the massif data is written to %.memprof.
# View the result with the commented ms_print line below.
%.memprof: run-%
	valgrind --tool=massif --time-unit=i --max-snapshots=500 --massif-out-file=$@ -v $<
#	ms_print $@ | less

# CPU-profile a binary: temporarily swap the user's compile flags for
# profiling flags, force-rebuild, run gprof, then restore the flags.
# Fixed: (a) recursive build now uses $(MAKE) instead of literal `make';
# (b) the rebuild and gprof lines are `-'-prefixed so that a failure no
# longer aborts the recipe and leaves user-cflags.txt clobbered with the
# profiling flags -- the restore line always runs; (c) the saved-flags
# scratch file is removed afterwards.
%.procprof: 
	cat user-cflags.txt > user-cflags.tmp.txt
	echo '-DNDEBUG -O3 -pg' > user-cflags.txt
	-$(MAKE) $* -B
	-gprof $* > $@
	cat user-cflags.tmp.txt > user-cflags.txt
	rm -f user-cflags.tmp.txt

# Cleanup targets are commands, not files; declare them phony so a stray file
# named `clean' (etc.) cannot silently disable them, and so they always run.
.PHONY: dist-clean clean tidy

# Remove everything including packaged models (pkgmodel).
dist-clean:
	@echo 'Do you really want to destroy all models in genmodel?  If not, CTRL-C and copy it from somewhere!'
	@sleep 5
	-rm bin/* genmodel/* */*.o ./*~ */*.a */*.cmx */*.d ./semantic.cache pkgmodel/*
# Remove binaries, generated models, and build droppings (duplicate ./*~ glob removed).
clean:
	@echo 'Do you really want to destroy all models in genmodel?  If not, CTRL-C and copy it from somewhere!'
	@sleep 5
	-rm bin/* genmodel/* */*.o ./*~ */*.a */*.cmx */*.d ./semantic.cache
# Like clean but leaves genmodel intact.
tidy:
	-rm bin/*            */*.o ./*~ */*.a */*.cmx */*.d ./semantic.cache

#depend:
#	makedepend -Iinclude -I../rvtl/include -I../slush/include src/*.cpp -Y
# #	g++ -MM -Iinclude -I../rvtl/include -I../slush/include src/*.cpp ### but then do what with this?

