# syntax_sumA.py
#
# by William M. Darling (c) 2010
#
# Use SyntaxSum content distributions with SumBasic sentence scoring (SBH)
#
# usage: python syntax_sumA.py <data dir>
# where <data dir> contains the WP.txt zeta matrix

import sys
import nltk
from string import ascii_letters as ascii
from math import log

# settings -- module-level configuration read by the summarization script below
corpus_name = "DUC 2006"	# dataset description; only used in the progress message
alg_name = "SBH"		# algorithm description; only used in the progress message
numpd = 25			# number of documents per directory; a summary is written after every numpd documents
OUT = "OUT"			# output directory; one summary file per directory, named by the directory index
DIRE = "DUC"			# input directory; documents are read from files named 0 .. ndocs-1 inside it
maxwords = 250			# max summary length, counted in whitespace-separated words
ndocs = 1250			# total number of documents in collection


# Split a document into sentences
def segment_document(doc_string):
	"""Return the sentences of doc_string as a list of strings.

	The text is stripped of leading/trailing whitespace and then split with
	NLTK's pre-trained English Punkt model.
	"""
	# the English Punkt sentence-boundary model shipped with NLTK's data files
	splitter = nltk.data.load('tokenizers/punkt/english.pickle')
	return splitter.tokenize(doc_string.strip())

# Break a string into lowercase word tokens
def tokenize(txt):
	"""Return the purely alphabetic tokens of txt, lowercased, in order.

	Tokens come from nltk.wordpunct_tokenize; anything that is not entirely
	alphabetic (punctuation, numbers, mixed tokens) is dropped.
	"""
	return [piece.lower() for piece in nltk.wordpunct_tokenize(txt) if piece.isalpha()]


# Load the per-directory word distributions (the zeta matrix) from a WP.txt file
def _load_zeta(path):
	"""Parse the zeta matrix file into a list of {word: value} dicts.

	A line whose first character is 'T' starts a new distribution.  Any other
	line that does not begin with whitespace is split on whitespace and its
	first two fields are stored as word -> value (the value is kept as the
	string found in the file).  An empty distribution between two 'T' lines
	is skipped; the last distribution is always appended, even when empty.
	"""
	zeta = []
	current = {}
	with open(path) as zeta_file:
		for line in zeta_file:
			if line[0] == 'T':
				if current:
					zeta.append(current)
				current = {}
			elif line[0].strip() != "":
				fields = line.split()
				current[fields[0]] = fields[1]
	zeta.append(current)
	return zeta


# Average weight of a tokenized sentence
def _sentence_score(tokens, weights):
	"""Return the summed weight of tokens divided by the token count.

	Tokens missing from weights contribute nothing to the sum but still count
	towards the length.  A sentence without tokens scores 0.0.
	"""
	if not tokens:
		return 0.0
	total = 0.0
	for w in tokens:
		if w in weights:
			total += weights[w]
	return total / len(tokens)


# Greedy sentence selection with the squaring redundancy update
def _select_summary(sentences, token_lists, locations, dist, max_words):
	"""Build the summary text for one collection of sentences.

	sentences   -- list of sentence strings
	token_lists -- token_lists[j] is the token list of sentences[j]
	locations   -- locations[j] is the sort key used to order chosen sentences
	dist        -- {word: value} content distribution; values may be strings
	               or numbers and are converted with float().  Not modified.
	max_words   -- maximum number of whitespace-separated words returned

	Sentences are picked one at a time, highest score first (ties go to the
	higher index), until the picked sentences hold at least max_words words
	or no sentence is left.  After each pick the weight of every token of the
	picked sentence is squared.  The picked sentences are then ordered by
	location and the first max_words words are returned, each followed by a
	single space.  Returns "" when there is nothing to select.
	"""
	# private float copy of the weights, restricted to words that can occur
	weights = {}
	for tokens in token_lists:
		for w in tokens:
			if w in dist and w not in weights:
				weights[w] = float(dist[w])

	def rank(j):
		# the index breaks score ties, so the choice is deterministic
		return (_sentence_score(token_lists[j], weights), j)

	chosen = []
	remaining = set(range(len(sentences)))
	word_count = 0
	# stop when out of sentences as well; otherwise a collection shorter than
	# max_words would never terminate
	while word_count < max_words and remaining:
		best = max(remaining, key=rank)
		remaining.discard(best)
		chosen.append(best)
		word_count += len(sentences[best].split())

		# redundancy removal update; applied once per token occurrence, so a
		# word repeated in the sentence is squared repeatedly
		for w in token_lists[best]:
			if w in weights:
				weights[w] = weights[w] * weights[w]

	# order summary according to sentence location (stable for equal keys)
	ordered = sorted(chosen, key=lambda j: locations[j])
	words = " ".join(sentences[j] for j in ordered).split()

	# truncate to max_words words; slicing copes with summaries that are shorter
	return "".join(w + " " for w in words[:max_words])


# Script entry point
def main():
	"""Summarize every directory of the collection and write the results.

	Reads the zeta matrix from <data dir>/WP.txt (data dir is sys.argv[1]),
	then reads documents DIRE/0 .. DIRE/(ndocs-1).  After every numpd
	documents the accumulated sentences are summarized with the matching
	distribution and the summary is written to OUT/<directory index>.
	Documents left over when ndocs is not a multiple of numpd are read but
	never summarized.
	"""
	if len(sys.argv) < 2:
		print("Usage: python syntax_sumA.py <data dir>")
		sys.exit(1)
	data_dir = sys.argv[1]

	# open zeta matrix
	zeta = _load_zeta(data_dir + "/WP.txt")

	# do summarization...
	print("Summarizing %s articles using %s algorithm..." % (corpus_name, alg_name))

	sentences = []
	locations = []
	directory = 0
	for d in range(ndocs):

		# open document
		with open("%s/%d" % (DIRE, d), "r") as doc_file:
			text = doc_file.read()

		# get sentences
		sens = segment_document(text)
		sentences += sens

		# give each sentence a score in [0,1] showing where it was found in its respective document
		num_sens = len(sens)
		for pos in range(num_sens):
			if num_sens > 1:
				locations.append(float(pos) / float(num_sens - 1))
			else:
				locations.append(0)

		if (d + 1) % numpd != 0:
			continue

		# got all the sentences of this directory
		print("Summarizing Directory %d" % directory)

		# tokenize each sentence once instead of on every scoring pass
		token_lists = [tokenize(sentence) for sentence in sentences]
		output = _select_summary(sentences, token_lists, locations, zeta[directory], maxwords)

		with open(OUT + "/" + str(directory), "w") as out_file:
			out_file.write(output)

		# NEXT...
		directory += 1
		sentences = []
		locations = []


if __name__ == '__main__':
	main()
