
# Output directory for generated tree/model files, and directory of helper scripts.
MODELDIR = genmodel
SCRIPTSDIR = scripts
# Root of the parsed Switchboard treebank; the entries named 2 and 3 under it
# are used as training data (each is read as a directory of *.mrg files).
SWBDTREEDATA   = /project/nlp/data/treebank/parsed/mrg/swbd
# One generated file per training entry, at three stages of the pipeline.
# ':=' is used so the wildcard glob runs once at parse time rather than on
# every reference to these variables.
SWBDTRAINTREES := $(patsubst $(SWBDTREEDATA)/%,$(MODELDIR)/%.swbd.trees,$(wildcard $(SWBDTREEDATA)/[2-3]))
SWBDCRCTRAINSET := $(patsubst $(SWBDTREEDATA)/%,$(MODELDIR)/%.swbd.crctrees,$(wildcard $(SWBDTREEDATA)/[2-3]))
SWBDCRCTRAINSETPOS := $(patsubst $(SWBDTREEDATA)/%,$(MODELDIR)/%.swbd.crcpostrees,$(wildcard $(SWBDTREEDATA)/[2-3]))

# Treebank entry whose generated files supply the dev and test inputs.
SWBDEVALSET = 4
# Presumably the number of leading sentences that form the test set (the value
# matches the line count taken by the *.swbd.test.* rules) - confirm.
SWBDLASTSENT = 6051

## RELEASE: Comment out for distribution....
## Every intermediate tree file of the chain is marked precious, so make
## neither deletes it as a chained intermediate nor removes it on interrupt.
.PRECIOUS: \
  $(MODELDIR)/%.swbd.sedtrees \
  $(MODELDIR)/%.swbd.trees \
  $(MODELDIR)/%.swbd.cnftrees \
  $(MODELDIR)/%.swbd.rctrees \
  $(MODELDIR)/%.swbd.crctrees \
  $(MODELDIR)/%.swbd.elidedtrees \
  $(MODELDIR)/%.swbd.cnfpostrees \
  $(MODELDIR)/%.swbd.rcpostrees \
  $(MODELDIR)/%.swbd.crcpostrees


########################
# Steps for building rctrees
########################

# Build the raw tree file for one treebank entry: concatenate the *.mrg files
# in that entry's directory, filter them through removePunct.sed, then run
# clean-tool/new-clean on the result (presumably rewriting it in place - confirm).
# The sed script is referenced through $(SCRIPTSDIR) like the other helpers.
# NOTE(review): tbtrees2trees.pl is a prerequisite but is not invoked by this
# recipe - confirm it is still needed.
$(MODELDIR)/%.swbd.trees: $(SWBDTREEDATA)/% $(SCRIPTSDIR)/tbtrees2trees.pl $(SCRIPTSDIR)/removePunct.sed
	@echo 'You must have clean-tools package and tsurgeon installed for this to work- you have 5 seconds to ctrl-C kill this process'
	@sleep 5
	cat $</*.mrg | sed -f $(SCRIPTSDIR)/removePunct.sed > $@
	clean-tool/new-clean $@

# Produce the .haletrees variant of a section by running its .trees file
# through buildModel.rb with the -cdu options.
# Script paths go through $(SCRIPTSDIR), matching the other rules.
# NOTE(review): haleify.sed is a prerequisite but is not invoked by this
# recipe - confirm it is still needed.
$(MODELDIR)/%.swbd.haletrees: $(MODELDIR)/%.swbd.trees $(SCRIPTSDIR)/haleify.sed $(SCRIPTSDIR)/buildModel.rb
	ruby $(SCRIPTSDIR)/buildModel.rb -cdu < $< > $@

## buildModel changes edited annotations
## propArgs propagates arguments down - treesed propagates extractions up
## The leading perl step strips an enclosing "(S1 ...)" wrapper and the final
## sed removes brackets that contain at most one token.
## ST1 and ST2 supply the buildModel.rb option letters and are not defined in
## this section.
## buildModel.rb is listed under $(SCRIPTSDIR) so the prerequisite names the
## same file the recipe runs.
$(MODELDIR)/%.swbd.sedtrees: $(MODELDIR)/%.swbd.trees $(SCRIPTSDIR)/treebinarize.pl $(SCRIPTSDIR)/propArgs.rb $(SCRIPTSDIR)/treesed.pl $(SCRIPTSDIR)/buildModel.rb
	cat $< | \
	  perl -p -e 's/\(S1 (.*)\)/\1/' | \
	  ruby $(SCRIPTSDIR)/buildModel.rb -$(ST1) | \
	  perl $(SCRIPTSDIR)/treebinarize.pl -p | \
	  ruby $(SCRIPTSDIR)/buildModel.rb -$(ST2) | \
	  ruby $(SCRIPTSDIR)/propArgs.rb | \
	  perl $(SCRIPTSDIR)/treesed.pl | \
	  sed 's/( *[^ ]* *)//g' > $@

## sed (ruby) magic add POS at beginning of word
## perl script is basically trees2bintrees - naively binarizes >2-ary trees that don't
## have smarter rules
## buildModel.rb is declared as a prerequisite because the recipe runs it, so
## edits to that script trigger a rebuild.
$(MODELDIR)/%.swbd.cnftrees: $(MODELDIR)/%.swbd.sedtrees $(SCRIPTSDIR)/buildModel.rb $(SCRIPTSDIR)/trees2cnftrees.pl
	ruby $(SCRIPTSDIR)/buildModel.rb -t < $< | \
	  perl $(SCRIPTSDIR)/trees2cnftrees.pl > $@

# POS variant of the cnftrees step: identical pipeline except buildModel.rb is
# run with -tp instead of -t.
# buildModel.rb is declared as a prerequisite because the recipe runs it.
$(MODELDIR)/%.swbd.cnfpostrees: $(MODELDIR)/%.swbd.sedtrees $(SCRIPTSDIR)/buildModel.rb $(SCRIPTSDIR)/trees2cnftrees.pl
	ruby $(SCRIPTSDIR)/buildModel.rb -tp < $< | \
	  perl $(SCRIPTSDIR)/trees2cnftrees.pl > $@

# Convert CNF trees to rctrees: relabel the first bracket of each line as UTT,
# run lowercasepreterm.rb, convert cnf -> flat -> rc trees, then run
# buildModel.rb with no options.
# All scripts the recipe runs are declared as prerequisites.
# Disabled single-script alternative, kept for reference:
#	cat $< | ruby scripts/cnf2rc.rb > $@
$(MODELDIR)/%.swbd.rctrees: $(MODELDIR)/%.swbd.cnftrees $(SCRIPTSDIR)/lowercasepreterm.rb $(SCRIPTSDIR)/cnftrees2flattrees.pl $(SCRIPTSDIR)/flattrees2rctrees.pl $(SCRIPTSDIR)/buildModel.rb
	cat $< | \
	  perl -p -e 's/^\(([^ ]+)/(UTT/' | \
	  ruby $(SCRIPTSDIR)/lowercasepreterm.rb | \
	  perl $(SCRIPTSDIR)/cnftrees2flattrees.pl | \
	  perl $(SCRIPTSDIR)/flattrees2rctrees.pl | \
	  ruby $(SCRIPTSDIR)/buildModel.rb > $@

# POS variant of the rctrees step: the same pipeline applied to .cnfpostrees.
# buildModel.rb is referenced through $(SCRIPTSDIR) like the other scripts,
# and every script the recipe runs is declared as a prerequisite.
# Disabled single-script alternative, kept for reference:
#	cat $< | ruby scripts/cnf2rc.rb > $@
$(MODELDIR)/%.swbd.rcpostrees: $(MODELDIR)/%.swbd.cnfpostrees $(SCRIPTSDIR)/lowercasepreterm.rb $(SCRIPTSDIR)/cnftrees2flattrees.pl $(SCRIPTSDIR)/flattrees2rctrees.pl $(SCRIPTSDIR)/buildModel.rb
	cat $< | \
	  perl -p -e 's/^\(([^ ]+)/(UTT/' | \
	  ruby $(SCRIPTSDIR)/lowercasepreterm.rb | \
	  perl $(SCRIPTSDIR)/cnftrees2flattrees.pl | \
	  perl $(SCRIPTSDIR)/flattrees2rctrees.pl | \
	  ruby $(SCRIPTSDIR)/buildModel.rb > $@

# Last training-tree step for one section. In order: drop empty lines, run
# removeElided.rb, run countRights.rb, rewrite "(X^d Y^d)" pairs to "(X Y)"
# (d = one digit), and rewrite lines of the form "(utt<A> : <B>" into
# "(UTT (utt<A>) : <B>".
$(MODELDIR)/%.swbd.crctrees: $(MODELDIR)/%.swbd.rctrees $(SCRIPTSDIR)/countRights.rb $(SCRIPTSDIR)/removeElided.rb
	grep -v '^$$' $< | \
	  ruby $(SCRIPTSDIR)/removeElided.rb | \
	  ruby $(SCRIPTSDIR)/countRights.rb | \
	  sed 's/(\([^ ]*\)\^[0-9] *\([^ ]*\)\^[0-9] *)/(\1 \2)/g' | \
	  perl -p -e 's/^\(utt(.*) : (.*)/(UTT (utt\1) : \2/' > $@

# POS variant of the crctrees step. Unlike the crctrees rule, the "(utt..."
# -> "(UTT (utt...)" rewrite runs before countRights.rb here; that order is
# kept as-is.
# removeElided.rb is declared as a prerequisite because the recipe runs it.
$(MODELDIR)/%.swbd.crcpostrees: $(MODELDIR)/%.swbd.rcpostrees $(SCRIPTSDIR)/countRights.rb $(SCRIPTSDIR)/removeElided.rb
	grep -v '^$$' $< | \
	  ruby $(SCRIPTSDIR)/removeElided.rb | \
	  perl -p -e 's/^\(utt(.*) : (.*)/(UTT (utt\1) : \2/' | \
	  ruby $(SCRIPTSDIR)/countRights.rb | \
	  sed 's/(\([^ ]*\)\^[0-9] *\([^ ]*\)\^[0-9] *)/(\1 \2)/g' > $@

## Switchboard-specific POS model
# Collect the "Pw" lines for the training crctrees plus genmodel/eos.tree:
# lines containing ^4..^9 are dropped, ^1..^3 markers are stripped, the trees
# go through trees2dat-sr.rb, and only the part of each Pw line before " ="
# is kept.
# trees2dat-sr.rb is declared because it is the script the recipe runs.
# NOTE(review): trees2dat.rb is not invoked here; it is kept as a prerequisite
# in case trees2dat-sr.rb loads it - confirm.
genmodel/swbd.dat: $(SWBDCRCTRAINSET) genmodel/eos.tree $(SCRIPTSDIR)/trees2dat.rb $(SCRIPTSDIR)/trees2dat-sr.rb
	cat $(SWBDCRCTRAINSET) genmodel/eos.tree | \
	  grep -v '\^[4-9]' | \
	  sed 's/\^[1-3]//g' | \
	  ruby $(SCRIPTSDIR)/trees2dat-sr.rb | \
	  grep '^Pw' | \
	  perl -p -e 's/(.*) =.*/\1/' > $@

# POS counterpart of swbd.dat, built from the crcpostrees training files plus
# eos.swbdcrctrees with the same filter pipeline.
# The target and the eos file are written with $(MODELDIR) because the
# POS.swbd.model recipe requests this file as $(MODELDIR)/swbd.dat.pos; with
# the default MODELDIR the names are unchanged.
# trees2dat-sr.rb is declared because it is the script the recipe runs.
# NOTE(review): trees2dat.rb is not invoked here; it is kept as a prerequisite
# in case trees2dat-sr.rb loads it - confirm.
$(MODELDIR)/swbd.dat.pos: $(SWBDCRCTRAINSETPOS) $(MODELDIR)/eos.swbdcrctrees $(SCRIPTSDIR)/trees2dat.rb $(SCRIPTSDIR)/trees2dat-sr.rb
	cat $(SWBDCRCTRAINSETPOS) $(MODELDIR)/eos.swbdcrctrees | \
	  grep -v '\^[4-9]' | \
	  sed 's/\^[1-3]//g' | \
	  ruby $(SCRIPTSDIR)/trees2dat-sr.rb | \
	  grep '^Pw' | \
	  perl -p -e 's/(.*) =.*/\1/' > $@

# Build the Switchboard POS model. The rule has no prerequisites, so it only
# runs when the model file is missing; its inputs are brought up to date by a
# recursive make after a 5-second chance to abort.
# The model is written in two parts: the Pw lines rewritten as "PwDT ..." and
# passed through bin/postrainer, then "P : <tag>" lines passed through
# relfreq.pl and appended.
$(MODELDIR)/POS.swbd.model:
	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
	@sleep 5
	# $(MAKE) rather than a literal 'make' so flags and the jobserver carry over to the sub-make.
	$(MAKE) $(MODELDIR)/swbd.dat.pos bin/postrainer $(SCRIPTSDIR)/relfreq.pl
	grep '^Pw' $(MODELDIR)/swbd.dat.pos | \
	  sed 's/^Pw *\([^ ]*\) *: *\([^ ]*\)/PwDT \1 : \2/' | \
	  bin/postrainer | \
	  grep -v "WARNING: distribution Y_prior: no values" > $@
	grep '^Pw' $(MODELDIR)/swbd.dat.pos | \
	  sed 's/^Pw *\([^ ]*\) *: *\([^ ]*\)/P : \2/' | \
	  perl $(SCRIPTSDIR)/relfreq.pl >> $@

# Build the parser model named by PARSERMODEL (defined outside this section).
# Step 1 writes intermediate counts to QF.swbd.dat in the current directory:
# training crcpostrees plus eos.swbdcrctrees, minus lines containing ^5..^9,
# with ^N markers stripped, passed through trees2dat-noplus.rb.
# Steps 2-4 assemble the model from QF.swbd.dat:
#   - "Pw " lines through relfreq.pl -c 3, sorted;
#   - all other lines through relfreq.pl and sortbyprob.pl, appended;
#   - one "W : <word> = 1.0" line per distinct word of the Pw lines, appended.
# Helper scripts are referenced through $(SCRIPTSDIR) and the output through
# $@ instead of repeating the target name.
genmodel/$(PARSERMODEL): $(SWBDCRCTRAINSETPOS) $(MODELDIR)/eos.swbdcrctrees
	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
	@sleep 5
	cat $(SWBDCRCTRAINSETPOS) $(MODELDIR)/eos.swbdcrctrees | \
	  grep -v '\^[5-9]' | \
	  sed 's/\^[0-9]//g' | \
	  ruby $(SCRIPTSDIR)/trees2dat-noplus.rb > QF.swbd.dat
	grep '^Pw ' QF.swbd.dat | \
	  perl $(SCRIPTSDIR)/relfreq.pl -c 3 | \
	  sort > $@
	grep -v '^Pw ' QF.swbd.dat | \
	  perl $(SCRIPTSDIR)/relfreq.pl | \
	  perl $(SCRIPTSDIR)/sortbyprob.pl >> $@
	grep '^Pw ' QF.swbd.dat | \
	  perl $(SCRIPTSDIR)/relfreq.pl -c 3 | \
	  perl -n -e 'if(s/^Pw ([^ ]+).*/W : \1 = 1.0/){ print }' | \
	  sort -u >> $@

## Targets for running hhmm parser on swbd
# Dev set
# Parser input for the dev set: the final 300 lines of the evaluation
# section's .sedtrees file.
%.swbd.dev.in: $(MODELDIR)/$(SWBDEVALSET).swbd.sedtrees
	tail -300 $< > $@
#	make hhmm.swbd.dev.dat

# Parser input for the test set: the first $(SWBDLASTSENT) lines of the
# evaluation section's .sedtrees file.
# The .sedtrees file is declared as a prerequisite (as in the dev-set rule) so
# it is built when missing, and the line count comes from SWBDLASTSENT instead
# of a duplicated literal.
%.swbd.test.in: $(MODELDIR)/$(SWBDEVALSET).swbd.sedtrees
	head -n $(SWBDLASTSENT) $< > $@
#	make hhmm.swbd.test.dat

# Test set
# Run the parser on <stem>.in and write its output to hhmm.<stem>.dat.
# The rule has no prerequisites; the models, the parser binary and the input
# file are each brought up to date by a recursive make, in this order, before
# the 5-second abort window. PARSERMODEL, POSMODEL and PARSER are defined
# outside this section.
# The sed step rewrites the bracketed input before it is piped to the parser;
# the input file is removed afterwards.
# $(MAKE) replaces a literal 'make' so flags and the jobserver are inherited.
# NOTE(review): genmodel/$(POSMODEL) is built above, but the parser is given
# genmodel/POS.swbd.model - confirm the two are meant to be the same file.
hhmm.%.dat:
	$(MAKE) genmodel/$(PARSERMODEL)
	$(MAKE) genmodel/$(POSMODEL)
	$(MAKE) bin/$(PARSER)
	$(MAKE) $*.in
	@echo 'Do you really want to re-build this big $@ file?  If not, CTRL-C and copy it from somewhere!'
	@sleep 5
	cat $*.in | \
	  sed 's/(\([A-Z\$$]*\)[^ ]* \([^()]*\))/(\1 \2\1)/g;s/([^ ]*//g;s/)//g;s/[^ \/]*\#//g' | \
	  nice bin/$(PARSER) -b 500 genmodel/$(PARSERMODEL) genmodel/POS.swbd.model > $@
	rm $*.in

# Turn parser output (hhmm.<stem>.dat) into bracketed trees for evaluation.
# Stages, in order: hypotheses -> rc trees -> flat trees -> cnf trees; strip
# "<prefix>#" markers; uppercasepreterm.rb; unbuildModel.rb -l; unbinarize;
# cut category labels back to their leading capitals; unbuildModel.rb -ed;
# normalise EDITED labels; then a series of perl substitutions that wrap bare
# words under NP/WHNP, UTT, PRT, INTJ, ADVP and EDITED in a preterminal
# bracket and delete capitals that directly precede ")".
# unbuildModel.rb is referenced through $(SCRIPTSDIR) like the other scripts,
# and every script the recipe runs is declared as a prerequisite.
hhmm.%.hypoth.evalform: hhmm.%.dat $(SCRIPTSDIR)/hypoths2rctrees.pl $(SCRIPTSDIR)/rctrees2flattrees.pl $(SCRIPTSDIR)/flattrees2cnftrees.pl $(SCRIPTSDIR)/uppercasepreterm.rb $(SCRIPTSDIR)/unbuildModel.rb $(SCRIPTSDIR)/unbinarize.pl
	cat $< | \
	  perl $(SCRIPTSDIR)/hypoths2rctrees.pl | \
	  perl $(SCRIPTSDIR)/rctrees2flattrees.pl | \
	  perl $(SCRIPTSDIR)/flattrees2cnftrees.pl | \
	  sed 's/[^ \/]*\#//g' | \
	  ruby $(SCRIPTSDIR)/uppercasepreterm.rb | \
	  ruby $(SCRIPTSDIR)/unbuildModel.rb -l | \
	  perl $(SCRIPTSDIR)/unbinarize.pl | \
	  perl -p -e 's/([A-Z]+)[-a-z\$$]+[-a-zA-Z\$$]*/\1/g' | \
	  ruby $(SCRIPTSDIR)/unbuildModel.rb -ed | \
	  sed 's/EDITED[^ ]*/EDITED/g' | \
	  perl -p -e 's/\((WH)?NP ([^(]+)\)/(\1NP (prp \2))/g' | \
	  perl -p -e 's/^\(UTT ([^(]+)\)$$/(UTT (UH \1))/' | \
	  perl -p -e 's/\(PRT ([^(]+)\)/(PRT (pos \1))/g' | \
	  perl -p -e 's/\(INTJ ([^(]+)\)/(INTJ (UH \1))/g' | \
	  perl -p -e 's/\(ADVP ([^(]+)\)/(ADVP (RB \1))/g' | \
	  perl -p -e 's/[A-Z]+\)/)/g' | \
	  perl -p -e 's/EDITED ([^(]+)\)/EDITED (pos \1))/' > $@

# Convert stanford-parser/sp.swbd.dev.out into bracketed trees for evaluation.
# Stages, in order: restore "_" from UNDERSCORE; strip the "(ROOT ...)"
# wrapper; unbuildModel.rb -s; rc trees -> flat trees -> cnf trees;
# unbinarize; cut category labels back to their leading capitals;
# unbuildModel.rb -de; normalise EDITED labels; delete capitals that directly
# precede ")"; remove every "UNF".
# unbuildModel.rb is referenced through $(SCRIPTSDIR) like the other scripts
# in this pipeline.
sp.swbd.dev.hypoth.evalform: stanford-parser/sp.swbd.dev.out
	cat $< | \
	  perl -p -e 's/UNDERSCORE/_/g' | \
	  perl -p -e 's/\(ROOT (.*)\)/\1/' | \
	  ruby $(SCRIPTSDIR)/unbuildModel.rb -s | \
	  perl $(SCRIPTSDIR)/rctrees2flattrees.pl | \
	  perl $(SCRIPTSDIR)/flattrees2cnftrees.pl | \
	  perl $(SCRIPTSDIR)/unbinarize.pl | \
	  perl -p -e 's/([A-Z]+)[-a-z\$$]+[-a-zA-Z\$$]*/\1/g' | \
	  ruby $(SCRIPTSDIR)/unbuildModel.rb -de | \
	  sed 's/EDITED[^ ]*/EDITED/g' | \
	  perl -p -e 's/[A-Z]+\)/)/g' | \
	  perl -p -e 's/UNF//g' > $@

# Gold trees for the sp.* dev evaluation: the final 300 lines of the
# evaluation section's .trees file with the "(S1...)" wrapper stripped.
sp.swbd.dev.gold.evalform: $(MODELDIR)/$(SWBDEVALSET).swbd.trees
	tail -300 $< | \
	  perl -p -e 's/\(S1(.*)\)/\1/g' > $@

# Score the sp.* dev hypotheses with bin/evalb using the srcmodel/new.prm
# parameter file; the gold file is passed first, then the hypotheses.
sp.swbd.dev.eval: \
  sp.swbd.dev.gold.evalform \
  sp.swbd.dev.hypoth.evalform
	bin/evalb -p srcmodel/new.prm $^ > $@

# Gold trees for a dev evaluation: the final 300 lines of the evaluation
# section's .trees file, with the "(S1...)" wrapper stripped and the leading
# " (<label>" of each line replaced by "(UTT".
%.swbd.dev.gold.evalform: $(MODELDIR)/$(SWBDEVALSET).swbd.trees
	tail -300 $< | \
	  perl -p -e 's/\(S1(.*)\)/\1/g' | \
	  perl -p -e 's/^ \([^ ]+/(UTT/g' > $@

# Gold trees for a test evaluation: the first $(SWBDLASTSENT) lines of the
# evaluation section's .trees file, with the "(S1...)" wrapper stripped and
# the leading " (<label>" of each line replaced by "(UTT".
# The line count comes from SWBDLASTSENT instead of a duplicated literal.
%.swbd.test.gold.evalform: $(MODELDIR)/$(SWBDEVALSET).swbd.trees
	head -n $(SWBDLASTSENT) $< | \
	  perl -p -e 's/\(S1(.*)\)/\1/g' | \
	  perl -p -e 's/^ \([^ ]+/(UTT/g' > $@

#%eval.out: hhmm.swbd.%.gold.evalform hhmm.swbd.%.evalform
#	bin/evalb -p srcmodel/new.prm $^ > $@

