import numpy as np
from matplotlib.mlab import PCA as mlabPCA
from matplotlib import pyplot as plt


# Total number of words compared: the first (wordNum - 2) entries of the
# common-word list plus the two extra words "i" and "you" appended below.
wordNum = 102
# word -> vector text; 0 is a placeholder until the vector files are scanned.
selectedWikiVec = {}
selectedTwitterVec = {}
# Words in row order; the position in this list is the row index used later.
wordList = []

count = 0
# Use a context manager so the file handle is closed once the words are read.
with open("../Common1000Word_150409.txt", 'r') as commonWordFile:
	for line in commonWordFile:
		eachWord = line.strip()
		selectedWikiVec[eachWord] = 0
		selectedTwitterVec[eachWord] = 0
		wordList.append(eachWord)
		count += 1
		# Leave room for the two extra words added after the loop.
		if count == wordNum - 2:
			break

# The last two rows are always "i" and "you".
for extraWord in ("i", "you"):
	selectedWikiVec[extraWord] = 0
	selectedTwitterVec[extraWord] = 0
	wordList.append(extraWord)

# Scan the Wikipedia vector file and keep the vector text of every selected
# word. Each line is expected to be "<word> <v1> <v2> ...".
count = 0
with open("../wikipedia_text_vector_cleaned_0406.txt", 'r') as wikiVectorFile:
	for line in wikiVectorFile:
		oneRecord = line.split()
		if not oneRecord:
			# Blank line: nothing to index.
			continue
		word = oneRecord[0]
		if word in selectedWikiVec:
			print(word)
			# Keep only the tokens after the word. Removing the word with
			# str.replace would also delete matching characters inside the
			# numbers (e.g. the word "e" inside "1e-05").
			selectedWikiVec[word] = ' '.join(oneRecord[1:])
			count += 1
			# Stop early once every selected word has been seen.
			if count == wordNum:
				break


# Scan the Twitter vector file and keep the vector text of every selected
# word. Each line is expected to be "<word> <v1> <v2> ...".
count = 0
with open("../tweet_text_vector_cleaned_0406.txt", 'r') as tweetVectorFile:
	for line in tweetVectorFile:
		oneRecord = line.split()
		if not oneRecord:
			# Blank line: nothing to index.
			continue
		word = oneRecord[0]
		if word in selectedTwitterVec:
			# Keep only the tokens after the word. Removing the word with
			# str.replace would also delete matching characters inside the
			# numbers (e.g. the word "e" inside "1e-05").
			selectedTwitterVec[word] = ' '.join(oneRecord[1:])
			count += 1
			# Stop early once every selected word has been seen.
			if count == wordNum:
				break


# Row i of `a` (Wikipedia) and `b` (Twitter) holds the 300-dimensional vector
# of wordList[i].
a = np.zeros(shape=(wordNum, 300))
b = np.zeros(shape=(wordNum, 300))
count = 0
for count, word in enumerate(wordList):
	wikiText = selectedWikiVec[word]
	twitterText = selectedTwitterVec[word]
	# A value of 0 is the placeholder set when the word list was loaded; it
	# means the word never appeared in the corresponding vector file.
	if wikiText == 0:
		raise ValueError("no Wikipedia vector found for word %r" % (word,))
	if twitterText == 0:
		raise ValueError("no Twitter vector found for word %r" % (word,))
	a[count] = np.array(wikiText.split(), dtype=float)
	b[count] = np.array(twitterText.split(), dtype=float)

# a = a.transpose()
# b = b.transpose()

# np.random.seed(234234782384239784) # random seed for consistency

# A reader pointed out that Python 2.7 would raise a
# "ValueError: object of too small depth for desired array".
# This can be avoided by choosing a smaller random seed, e.g. 1
# or by completely omitting this line, since I just used the random seed for
# consistency.

# mu_vec1 = np.array([0,0,0])
# cov_mat1 = np.array([[1,0,0],[0,1,0],[0,0,1]])
# class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20).T



selectedTransformW2TVec = {}
# Row i of `c` holds the transformed (wiki-to-twitter) vector for row i.
c = np.zeros(shape=(wordNum, 300))

# Zero-based line numbers in the transformed-vector file whose contents fill
# the last two rows of `c`.
# NOTE(review): presumably these are the lines for "i" and "you" (the last two
# entries of wordList) - confirm against the file.
penultimateRowSourceLine = 159
lastRowSourceLine = 561

count = 0
# Use a context manager so the file handle is closed after reading.
with open("sorted_afterTranformation_w2t_Matirx_0422.vecs.txt", 'r') as transforW2TVectorFile:
	for count, line in enumerate(transforW2TVectorFile):
		if count < wordNum - 2:
			# The first (wordNum - 2) lines map one-to-one onto rows of c.
			c[count] = line.strip().split()
		elif count == penultimateRowSourceLine:
			c[wordNum - 2] = line.strip().split()
		elif count == lastRowSourceLine:
			c[wordNum - 1] = line.strip().split()
			# Nothing after this line is needed.
			break





# Sanity checks: each source matrix must hold one 300-dimensional row per
# selected word before they are stacked.
assert a.shape == (wordNum, 300), \
	"Wiki matrix has shape %s, expected (%d, 300)" % (a.shape, wordNum)
assert b.shape == (wordNum, 300), \
	"Twitter matrix has shape %s, expected (%d, 300)" % (b.shape, wordNum)
assert c.shape == (wordNum, 300), \
	"W2T matrix has shape %s, expected (%d, 300)" % (c.shape, wordNum)

# Stack the three matrices row-wise: rows [0, wordNum) come from a,
# [wordNum, 2*wordNum) from b and [2*wordNum, 3*wordNum) from c.
all_samples = np.concatenate((a, b, c), axis=0)

assert all_samples.shape == (wordNum * 3, 300), \
	"Stacked matrix has shape %s, expected (%d, 300)" % (all_samples.shape, wordNum * 3)









# Run PCA over the stacked Wikipedia / Twitter / transformed vectors.
# NOTE(review): matplotlib.mlab.PCA is not available in recent Matplotlib
# releases (removed in 3.1) - confirm the Matplotlib version this runs on.
mlab_pca = mlabPCA(all_samples)

# Format into a single string so the output is the same under Python 2 and
# Python 3; print(a, b) under Python 2 would print the repr of a tuple.
print('PC axes in terms of the measurement axes'
	' scaled by the standard deviations:\n%s' % (mlab_pca.Wt,))


fig, ax = plt.subplots()

# Rows of wordList to show in the figure.
selectedCompareWordList = [2, 5, 6, 10, 100, 101]


def _plotLabeledPoint(row, label, marker, color, textOffset):
	"""Plot row `row` of the PCA projection and annotate it with `label`.

	The point is drawn at the first two projected coordinates of the row.
	`textOffset` is the (x, y) label offset in points relative to the marker.
	Returns the Line2D handle so it can be reused in the legend.
	"""
	x = mlab_pca.Y[row, 0]
	y = mlab_pca.Y[row, 1]
	point, = plt.plot(x, y, marker, markersize=18, color=color, alpha=0.5)
	ax.annotate(label, (x, y), xycoords='data',
		xytext=textOffset, textcoords='offset points',
		arrowprops=dict(arrowstyle="->"),
		size=25)
	return point


for i in selectedCompareWordList:
	# Rows [0, wordNum) are Wikipedia, [wordNum, 2*wordNum) are Twitter and
	# [2*wordNum, 3*wordNum) are the transformed (W2T) vectors.
	wikipoint = _plotLabeledPoint(i, wordList[i], 'o', 'blue', (40, -30))

	# Words 2 and 5 get their own label offsets - presumably to keep the
	# labels from overlapping in the final figure.
	twitterOffset = (-30, 50) if i == 2 else (-20, -50)
	twitterpoint = _plotLabeledPoint(wordNum + i, wordList[i], '^', 'red', twitterOffset)

	w2tOffset = (-40, 50) if i == 5 else (20, 50)
	w2tpoint = _plotLabeledPoint(wordNum * 2 + i, wordList[i], 's', 'green', w2tOffset)

plt.xlim([-13, 13])
plt.ylim([-13, 13])

# One legend built from the real plot handles. This replaces the former
# plt.plot(None, None, ...) proxy artists, which newer Matplotlib rejects,
# and the first legend call that was immediately overwritten by a second one.
plt.legend([wikipoint, twitterpoint, w2tpoint],
	['Wiki Vector', 'Twitter Vector', 'W2T Vector'],
	fontsize=20)

plt.show()