# topics.py
#
# by William M. Darling (c) 2010
#
# This script is based on the script 'topics.py' distributed with the package lda-c-dist
# by David M. Blei and available at http://www.cs.princeton.edu/~blei/lda-c/.
#
# This is a python script which allows a user to quickly examine the document set topic 
# distributions learned by the SyntaxSum model. This can be useful to get a heuristically
# accurate determination of whether or not the Markov chain has reached its stationary 
# distribution; if there are no "stop" words in the topics and the top words appear to be 
# logically correct, then it is likely that the chain has converged.
#
# usage: python topics.py <zeta file> <vocab file> <num words>
#
# <zeta file> is the file "zeta" output by the syntaxsum Gibbs sampler
# <vocab file> is the corpus vocabulary "WO" created by the script prepare_day.py
# <num words> is the number of words to print from each document set topic

import sys

def print_topics(zeta_file, vocab_file, nwords):
    """Print the highest-weighted vocabulary words for each row of a zeta file.

    Every line of ``zeta_file`` is parsed as whitespace-separated floats, where
    the i-th value is the weight of the i-th line of ``vocab_file``.  For each
    line a "Document Set <n>" heading is printed (n counts lines from 0),
    followed by up to ``nwords`` words in order of decreasing weight.  Words
    with equal weight are listed in vocabulary order.

    Args:
        zeta_file: path to the file of weights, one row per line.
        vocab_file: path to the vocabulary file, one word per line.
        nwords: maximum number of words to print per row; values <= 0 print
            no words.

    Raises:
        IOError/OSError: if either file cannot be opened.
        ValueError: if a token in ``zeta_file`` is not a valid float.
    """
    # Read the vocabulary; 'with' guarantees the handle is closed.
    with open(vocab_file, 'r') as handle:
        vocab = [line.strip() for line in handle]

    # A negative count must behave like zero, not like a slice from the end.
    limit = max(nwords, 0)

    with open(zeta_file, 'r') as handle:
        for topic_no, line in enumerate(handle):
            print('\nDocument Set %d' % topic_no)
            print('---------------')
            weights = [float(token) for token in line.split()]

            # Only rank indices that have both a word and a weight, so a short
            # row or a short vocabulary cannot raise IndexError.
            usable = min(len(vocab), len(weights))

            # sorted() is stable even with reverse=True, so ties keep
            # vocabulary order regardless of how earlier rows were ranked.
            ranked = sorted(range(usable), key=weights.__getitem__, reverse=True)

            for idx in ranked[:limit]:
                print('%s' % vocab[idx])

    # Emitted once after all rows, matching how Python 2 parsed the original.
    print('\n')

if __name__ == '__main__':
    # Command-line entry point: topics.py <zeta file> <vocab file> <num words>
    usage = 'usage: python topics.py <zeta file> <vocab file> <num words>\n'

    # Exactly three arguments are required after the script name.
    if len(sys.argv) != 4:
        print(usage)
        sys.exit(1)

    zeta_file = sys.argv[1]
    vocab_file = sys.argv[2]

    # A non-integer word count is a usage error, not a traceback.
    try:
        nwords = int(sys.argv[3])
    except ValueError:
        print(usage)
        sys.exit(1)

    print_topics(zeta_file, vocab_file, nwords)
