Skip to content

Commit

Permalink
wmd working
Browse files Browse the repository at this point in the history
  • Loading branch information
arnavkj1995 committed Apr 9, 2016
1 parent 258e89c commit 3a3a513
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 71 deletions.
6 changes: 3 additions & 3 deletions scripts/llda.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def set_corpus(self, labelset, corpus, labels):
labelset.insert(0, "common")
self.labelmap = dict(zip(labelset, range(len(labelset))))
self.K = len(self.labelmap)

print (self.K)
self.vocas = []
self.vocas_id = dict()
self.labels = numpy.array([self.complement_label(label) for label in labels])
Expand Down Expand Up @@ -142,8 +142,8 @@ def main():

phi = llda.phi()
for v, voca in enumerate(llda.vocas):
#print ','.join([voca]+[str(x) for x in llda.n_z_t[:,v]])
print len(phi[:,v])#print ','.join([voca]+[str(x) for x in phi[:,v]])
print (','.join([voca]+[str(x) for x in llda.n_z_t[:,v]]))
print (','.join([voca]+[str(x) for x in phi[:,v]]))

if __name__ == "__main__":
main()
129 changes: 61 additions & 68 deletions scripts/wmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,22 @@
from numpy import genfromtxt
import sys
import json, gensim, logging
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import random
from collections import Counter


model = gensim.models.Word2Vec.load('/home/adarsh/wordmodel')
with open('clean.json') as data_file:
model = gensim.models.Word2Vec.load_word2vec_format('vectors.bin', binary = True)
with open('text.json') as data_file:
data=json.load(data_file)# type(data)=dict
ques=list(data.keys())
# for i in data:
# if(len(i)!=0):
# ques.append(i)
print(len(ques))
# ques=[q for q in ques1 if(len(q)!=0)]
# print(len(ques))
# for i in range(len(ques)):
# if(len(ques[i])==0):
# del ques[i]
# print(len(ques))

ques, tags = [], []
for key, value in data.items():
ques.append(key)
tags.append(data[key])

questions_train, questions_test, tags_train, tags_test = train_test_split(ques, tags, test_size=0.001, random_state = random.randint(1, 100))
print type(tags_test)
tokenizer = RegexpTokenizer(r'\w+')
stop = stopwords.words('english')
dictionary = enchant.request_dict("en_US")
Expand All @@ -34,35 +34,32 @@ def clean_ques(query):
# here query is list of words that are present in the question
query = query.lower()# converted to lowercase alphabet
query = tokenizer.tokenize(query) # tokenized
# for i in range(len(ques)):
# if not enchant.dict_exists(ques[i]):
# ques[i] = dictionary.suggest(sent)[0]
query = [q for q in query if q not in stop] # removed stop words
return query
#word1=clean_ques("What are the best places to learn how to dance in Philadelphia?")
#word2=clean_ques("Political Science: What factors tend shift a population in a more liberal or more conservative direction?")

def wordvec(word):
return model[word]
try:
return model[word]
except KeyError:
pass
return numpy.zeros(len(model["one"]))

#Get the Word Centroid Distance
def wcd(sent1, sent2):
# here sent1 & sent2 both are list of words
try:
if(len(sent1)>0 and len(sent2)>0):
s1 = wordvec(sent1[0])
s2 = wordvec(sent2[0])
else:
return 10000
for i in range(1,len(sent1)):
s1 = s1 + wordvec(sent1[i])
for i in range(1,len(sent2)):
s2 = s2 + wordvec(sent2[i])

s1 = s1 / len(sent1)
s2 = s2 / len(sent2)
return numpy.linalg.norm(s1 - s2) # returns the norm of the difference of the two vectors
except KeyError:
if(len(sent1)>0 and len(sent2)>0):
s1 = wordvec(sent1[0])
s2 = wordvec(sent2[0])
else:
return 10000
for i in range(1,len(sent1)):
s1 = s1 + wordvec(sent1[i])
for i in range(1,len(sent2)):
s2 = s2 + wordvec(sent2[i])

s1 = s1 / len(sent1)
s2 = s2 / len(sent2)
return numpy.linalg.norm(s1 - s2) # returns the norm of the difference of the two vectors
#print(word1)

#Get the Relaxed Word Mover Distance
Expand Down Expand Up @@ -92,15 +89,8 @@ def rwmd(sent1, sent2):
dist2 = dist2 + (1.0 / len(sent2)) * d

return max(dist1, dist2)
#print(rwmd(word2,word2))
#print(word2)
#print(rwMd(word1, word1))
#print(rwmd(word1, word2))

def getwcd(query, num):
print("running function getwcd\nquery= ")
print(query,num)
# closest similar num ques chahiye
dic={}
for i in range(len(ques)):
if(len(ques[i])==0):
Expand All @@ -114,54 +104,57 @@ def getwcd(query, num):
if(dic[m]>val):
del dic[m]
dic[ques[i]]=val
#create a priority queue to stope the dist
print("loop finished")
return list(dic.keys())

def getrwmd(query, kwcd, num):
print("running function getrwmd\n query= ")
print(query,num)
dic={}
for i in range(len(kwcd)):
ques1=clean_ques(kwcd[i])
val=rwmd(query,ques1)
print (kwcd[i], val)
#print (kwcd[i], val)
if (len(dic)<num):
dic[kwcd[i]]=val
else:
m=max(dic,key=dic.get)
if(dic[m]>val):
del dic[m]
dic[kwcd[i]]=val
print("loop ended")
return list(dic.keys())
#create priority queue to store the dist
#return top num values

def getkNN(query, num):
print("calling getwcd")
kwcd = getwcd(query, 5 * num)
print("getwcd call finished")
print("length of kwcd= "+str(len(kwcd)))
print("calling getrwmd")
knn = getrwmd(query, kwcd, num)
print("getrwmd call finished")
return knn

def getTagsSimilarQues(query):
print("calling getTagsSimilarQues")
def getTagsSimilarQues(query, k = 5):

knn = getkNN(query, 50)
print("getkNN call finished")
for i in range(len(knn)):
print ( knn[i])
#print(knn)
#return tags of all 50 questions returned with count of occurrence
def main():
query=input("enter the question")
query=clean_ques(query)
print("query cleaned")
print(query)
getTagsSimilarQues(query)

if __name__ == '__main__':
main()
#return tags of all 50 questions returned with count of occurrence
tags=[]
for i in knn:
tags.extend(data[i])
tag1=Counter(tags).most_common(k)
tag=[tag1[i][0] for i in range(len(tag1))]
print 'here', type(tag)
return tag

pred = []
tt = []
print len(questions_test)
for i in range(len(questions_test)):
print (i + 1)
q = clean_ques(questions_test[i])
pred.append(getTagsSimilarQues(q, 5))
tt.append(data[questions_test[i]])

for i in range(len(tt)):
print pred[i]
print tt[i]
print '\n'
#precision, recall, f1_score, support = metrics.precision_recall_fscore_support(tt, pred, average='micro')
print("precision: %0.2f %%" % (precision * 100))
print("recall: %0.2f %%" % (recall * 100))
print("f score: %0.4f" % f1_score)

0 comments on commit 3a3a513

Please sign in to comment.