# prepare_data.py
#
# by William M. Darling (c) 2010
#
# This script prepares the necessary data files required for using the SyntaxSum Gibbs Sampler in this package.
#
# Usage: python prepare_data.py <DIR>
# where <DIR> contains the corpus in plain text files
#
# Creates "WS" (word stream), "DS" (document stream), and "WO" (words -- vocabulary)

import sys
import os
import nltk

# Split a document into sentences.
#
# doc_string: the full text of one document.
# Returns the list of sentences produced by the tokenizer loaded from
# NLTK's 'tokenizers/punkt/english.pickle' resource.
def segment_document(doc_string):
	punkt = nltk.data.load('tokenizers/punkt/english.pickle')

	# leading/trailing whitespace is dropped before segmentation
	return punkt.tokenize(doc_string.strip())

# Break a string into lower-cased word tokens.
#
# txt: the text to tokenize.
# Returns the tokens from nltk.wordpunct_tokenize that are purely alphabetic
# (per isalpha()), each converted to lower case; all other tokens are dropped.
def tokenize(txt):
	return [tok.lower() for tok in nltk.wordpunct_tokenize(txt) if tok.isalpha()]

# Sort key giving corpus files a stable, reproducible document order.
#
# Purely numeric names ("0", "1", ..., "10") come first in numeric order, so a
# corpus whose files are named 0..N-1 keeps the document numbering it always
# had; every other name follows in alphabetical order.
def _corpus_order(fname):
	if fname.isdigit():
		try:
			return (0, int(fname), fname)
		except ValueError:
			# isdigit() can accept characters int() rejects; treat as a plain name
			pass
	return (1, 0, fname)


# Script entry point: reads every regular file in the directory given as the
# first command-line argument and writes three files to the current directory:
#   WO -- the vocabulary, one word per line (line k holds the word with id k)
#   WS -- the word stream: space-separated word ids, 0 marking a sentence end
#   DS -- the document stream: the 1-based document id of every WS entry
# Exits with status 1 and a usage message when no directory is given.
if __name__ == '__main__':

	if len(sys.argv) < 2:
		print("Usage: python prepare_data.py <dir>")
		sys.exit(1)

	corpus_dir = sys.argv[1]

	# load files in given directory: open each entry by its real name (skipping
	# sub-directories) instead of assuming the files are called "0", "1", ...
	# os.listdir returns names in arbitrary order, hence the explicit sort.
	fnames = []
	for name in os.listdir(corpus_dir):
		if os.path.isfile(os.path.join(corpus_dir, name)):
			fnames.append(name)
	fnames.sort(key=_corpus_order)

	docs = []
	for name in fnames:
		# 'with' closes each input file as soon as it has been read
		with open(os.path.join(corpus_dir, name)) as fh:
			docs.append(fh.read())

	# get sentences (per doc)
	doc_sens = [segment_document(doc) for doc in docs]

	# get vocab: tokenize each document on its own so the last word of one
	# document can never fuse with the first word of the next one
	vocab_set = set()
	for doc in docs:
		vocab_set.update(tokenize(doc))

	# sorted so that word ids are reproducible from run to run
	vocab = sorted(vocab_set)

	# get vocab index (dict): word -> 1-based id (0 is reserved for sentence ends)
	vocab_i = dict((word, idx + 1) for idx, word in enumerate(vocab))

	# get ordered list of word indices with 0 marking the end of a sentence
	# AND get list of doc indices for each word
	WS = []
	DS = []
	for doc_idx, sentences in enumerate(doc_sens):
		doc_id = doc_idx + 1
		for sen in sentences:
			for word in tokenize(sen):
				# words missing from the vocabulary are skipped rather than
				# raising a KeyError
				if word in vocab_i:
					WS.append(vocab_i[word])
					DS.append(doc_id)
			# sentence terminator; it is attributed to the same document
			WS.append(0)
			DS.append(doc_id)

	# write out data...
	with open("WO", "w") as out:
		out.writelines(v + "\n" for v in vocab)

	with open("WS", "w") as out:
		out.writelines(str(w) + " " for w in WS)

	with open("DS", "w") as out:
		out.writelines(str(d) + " " for d in DS)
