wmd working

nishnik · Apr 9, 2016 · 3a3a513 · 3a3a513
1 parent 258e89c
commit 3a3a513
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 71 deletions.
diff --git a/scripts/llda.py b/scripts/llda.py
@@ -55,7 +55,7 @@ def set_corpus(self, labelset, corpus, labels):
         labelset.insert(0, "common")
         self.labelmap = dict(zip(labelset, range(len(labelset))))
         self.K = len(self.labelmap)
-
+        print (self.K)
         self.vocas = []
         self.vocas_id = dict()
         self.labels = numpy.array([self.complement_label(label) for label in labels])
@@ -142,8 +142,8 @@ def main():
 
     phi = llda.phi()
     for v, voca in enumerate(llda.vocas):
-        #print ','.join([voca]+[str(x) for x in llda.n_z_t[:,v]])
-        print len(phi[:,v])#print ','.join([voca]+[str(x) for x in phi[:,v]])
+        print (','.join([voca]+[str(x) for x in llda.n_z_t[:,v]]))
+        print (','.join([voca]+[str(x) for x in phi[:,v]]))
 
 if __name__ == "__main__":
     main()
diff --git a/scripts/wmd.py b/scripts/wmd.py
@@ -9,22 +9,22 @@
 from numpy import genfromtxt
 import sys
 import json, gensim, logging
+from sklearn.cross_validation import train_test_split
+from sklearn import metrics
+import random
+from collections import Counter
 
-
-model = gensim.models.Word2Vec.load('/home/adarsh/wordmodel')
-with open('clean.json') as data_file:
+model = gensim.models.Word2Vec.load_word2vec_format('vectors.bin', binary = True)
+with open('text.json') as data_file:
 	data=json.load(data_file)# type(data)=dict
-ques=list(data.keys())
-# for i in data:
-# 	if(len(i)!=0):
-# 		ques.append(i)
-print(len(ques))
-# ques=[q for q in ques1 if(len(q)!=0)]
-# print(len(ques))
-# for i in range(len(ques)):
-# 	if(len(ques[i])==0):
-# 		del ques[i]
-# print(len(ques))
+
+ques, tags = [], []
+for key, value in data.items():
+	ques.append(key)
+	tags.append(data[key])
+
+questions_train, questions_test, tags_train, tags_test = train_test_split(ques, tags, test_size=0.001, random_state = random.randint(1, 100))
+print type(tags_test)
 tokenizer = RegexpTokenizer(r'\w+')
 stop = stopwords.words('english')
 dictionary = enchant.request_dict("en_US")
@@ -34,35 +34,32 @@ def clean_ques(query):
 	# here query is list of words that are present in the question
 	query = query.lower()# converted to lowercase alphabet
 	query = tokenizer.tokenize(query) # tokenized
-	# for i in range(len(ques)):
-	# 	if not enchant.dict_exists(ques[i]):
- 	# 		ques[i] = dictionary.suggest(sent)[0]
 	query = [q for q in query if q not in stop] # removed stop words
 	return query
-#word1=clean_ques("What are the best places to learn how to dance in Philadelphia?")
-#word2=clean_ques("Political Science: What factors tend shift a population in a more liberal or more conservative direction?")
+
 def wordvec(word):
-	return model[word]
+	try:
+		return model[word]
+	except KeyError:
+		pass	
+	return numpy.zeros(len(model["one"]))	
 
 #Get the Word Centroid Distance
 def wcd(sent1, sent2):
 	# here sent1 & sent2 both are list of words
-	try:
-		if(len(sent1)>0 and len(sent2)>0):
-			s1 = wordvec(sent1[0])
-			s2 = wordvec(sent2[0])
-		else:
-			return 10000
-		for i in range(1,len(sent1)):
-			s1 = s1 + wordvec(sent1[i])
-		for i in range(1,len(sent2)):
-			s2 = s2 + wordvec(sent2[i])
-
-		s1 = s1 / len(sent1)
-		s2 = s2 / len(sent2)	
-		return numpy.linalg.norm(s1 - s2) # returns the norm of the difference of the two vectors	
-	except KeyError:
+	if(len(sent1)>0 and len(sent2)>0):
+		s1 = wordvec(sent1[0])
+		s2 = wordvec(sent2[0])
+	else:
 		return 10000
+	for i in range(1,len(sent1)):
+		s1 = s1 + wordvec(sent1[i])
+	for i in range(1,len(sent2)):
+		s2 = s2 + wordvec(sent2[i])
+
+	s1 = s1 / len(sent1)
+	s2 = s2 / len(sent2)	
+	return numpy.linalg.norm(s1 - s2) # returns the norm of the difference of the two vectors	
 #print(word1)
 
 #Get the Relaxed Word Mover Distance
@@ -92,15 +89,8 @@ def rwmd(sent1, sent2):
 		dist2 = dist2 + (1.0 / len(sent2)) * d	
 
 	return max(dist1, dist2)			
-#print(rwmd(word2,word2))
-#print(word2)
-#print(rwMd(word1, word1))
-#print(rwmd(word1, word2))
 
 def getwcd(query, num):
-	print("running function getwcd\nquery= ")
-	print(query,num)
-	# closest similar num ques chahiye
 	dic={}
 	for i in range(len(ques)):
 		if(len(ques[i])==0):
@@ -114,54 +104,57 @@ def getwcd(query, num):
 			if(dic[m]>val):
 				del dic[m]
 				dic[ques[i]]=val
-		#create a priority queue to stope the dist
-	print("loop finished")
 	return list(dic.keys())
 
 def getrwmd(query, kwcd, num):
-	print("running function getrwmd\n query= ")
-	print(query,num)
 	dic={}
 	for i in range(len(kwcd)):
 		ques1=clean_ques(kwcd[i])
 		val=rwmd(query,ques1)
-		print (kwcd[i], val)
+		#print (kwcd[i], val)
 		if (len(dic)<num):
 			dic[kwcd[i]]=val
 		else:
 			m=max(dic,key=dic.get)
 			if(dic[m]>val):
 				del dic[m]
 				dic[kwcd[i]]=val
-	print("loop ended")
 	return list(dic.keys())
 		#create priority queue to store the dist
 	#return top num values	
 
 def getkNN(query, num):
-	print("calling getwcd")
 	kwcd = getwcd(query, 5 * num)
-	print("getwcd call finished")
-	print("length of kwcd= "+str(len(kwcd)))
-	print("calling getrwmd")
 	knn = getrwmd(query, kwcd, num)
-	print("getrwmd call finished")
 	return knn
 
-def getTagsSimilarQues(query):
-	print("calling getTagsSimilarQues")
+def getTagsSimilarQues(query, k = 5):
+
 	knn = getkNN(query, 50)
-	print("getkNN call finished")
-	for i in range(len(knn)):
-		print (	knn[i])
 	#print(knn)
-	#return tags of all 50 questions returned with count of occurrence	
-def main():
-	query=input("enter the question")
-	query=clean_ques(query)
-	print("query cleaned")
-	print(query)
-	getTagsSimilarQues(query)
-
-if __name__ == '__main__':
-	main()
+	#return tags of all 50 questions returned with count of occurrence
+	tags=[]
+	for i in knn:
+		tags.extend(data[i])
+	tag1=Counter(tags).most_common(k)
+	tag=[tag1[i][0] for i in range(len(tag1))]
+	print 'here', type(tag)
+	return tag	
+
+pred = []
+tt = []
+print len(questions_test)
+for i in range(len(questions_test)):
+	print (i + 1)
+	q = clean_ques(questions_test[i])
+	pred.append(getTagsSimilarQues(q, 5))
+	tt.append(data[questions_test[i]])
+
+for i in range(len(tt)):
+	print pred[i]
+	print tt[i]
+	print '\n'
+#precision, recall, f1_score, support = metrics.precision_recall_fscore_support(tt, pred, average='micro')
+print("precision: %0.2f %%" % (precision * 100))
+print("recall: %0.2f %%" % (recall * 100))
+print("f score: %0.4f" % f1_score)