Skip to content

Commit

Permalink
add textrank to extract keyword
Browse files Browse the repository at this point in the history
  • Loading branch information
isnowfy committed Dec 5, 2013
1 parent dddef1d commit 97072a4
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 1 deletion.
51 changes: 51 additions & 0 deletions snownlp/summary/textrank.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,54 @@ def top_index(self, limit):

def top(self, limit):
    """Return the documents referenced by the first ``limit`` ranked entries.

    Each entry of ``self.top`` is indexed with ``[0]`` and that value is
    used as an index into ``self.docs``.  ``limit`` follows slice
    semantics, so ``None`` returns every entry.

    NOTE(review): this method shares its name with the ``self.top``
    attribute it reads.  If the constructor (not visible here) assigns
    ``self.top``, the attribute shadows this method on instances and it is
    only reachable through the class -- confirm and consider renaming.
    """
    # Slice before the lookup so only the requested documents are fetched;
    # the previous version ignored ``limit`` altogether.
    return [self.docs[entry[0]] for entry in self.top[:limit]]


class KeywordTextRank(object):
    """Rank words with a PageRank-style iteration over a co-occurrence graph.

    ``docs`` is an iterable of token sequences.  Two distinct words are
    joined by an undirected edge when they occur within ``window``
    consecutive tokens of the same sequence.  ``solve`` builds the graph,
    scores every word and stores the ranking in ``self.top``.
    """

    def __init__(self, docs):
        self.docs = docs
        self.words = {}        # word -> set of neighbouring words
        self.vertex = {}       # word -> current score
        self.d = 0.85          # damping factor of the score update
        self.max_iter = 200    # upper bound on scoring iterations
        self.min_diff = 0.001  # stop once no score moves more than this
        self.window = 5        # co-occurrence window size, in tokens
        # NOTE: this instance attribute shadows the ``top`` method defined
        # below, so ``instance.top`` is always this list of (word, score)
        # pairs.  It is kept under this name for backward compatibility;
        # use ``top_index`` to query the ranking.
        self.top = []

    def solve(self):
        """Build the graph, iterate the scores and fill ``self.top``.

        After the call ``self.top`` holds ``(word, score)`` pairs sorted by
        descending score.
        """
        self._build_graph()
        self._rank()
        # ``items()`` works on both Python 2 and 3 (``iteritems`` does not).
        self.top = sorted(self.vertex.items(),
                          key=lambda pair: pair[1], reverse=True)

    def _build_graph(self):
        """Link every word to the distinct words sharing a window with it."""
        for doc in self.docs:
            recent = []
            for word in doc:
                if word not in self.words:
                    self.words[word] = set()
                    self.vertex[word] = 1.0
                recent.append(word)
                if len(recent) > self.window:
                    del recent[0]
                # Older members of the window were already linked to each
                # other when they entered it, so only the newest word
                # needs new edges.
                for other in recent:
                    if other != word:
                        self.words[word].add(other)
                        self.words[other].add(word)

    def _rank(self):
        """Iterate the score update until it settles or hits ``max_iter``."""
        damping = self.d
        for _ in range(self.max_iter):
            updated = {}
            max_diff = 0
            for word, neighbours in self.words.items():
                score = 1 - damping
                for other in neighbours:
                    degree = len(self.words[other])
                    # Guard against a hand-edited graph: skip self loops
                    # and neighbours with no outgoing edges.
                    if other == word or degree == 0:
                        continue
                    score += damping / degree * self.vertex[other]
                updated[word] = score
                max_diff = max(max_diff, abs(score - self.vertex[word]))
            # Scores are swapped in only after a full pass, so every update
            # in one iteration reads the previous iteration's values.
            self.vertex = updated
            if max_diff <= self.min_diff:
                break

    def top_index(self, limit):
        """Return the first ``limit`` words of the ranking as a list."""
        return [entry[0] for entry in self.top[:limit]]

    def top(self, limit):
        """Return the first ``limit`` ranked words.

        The graph vertices are the words themselves, so there is no
        separate document to look up and the result equals
        ``top_index(limit)``.  Because ``__init__`` assigns the ``top``
        attribute, this method is reachable only through the class, e.g.
        ``KeywordTextRank.top(instance, limit)``.
        """
        return self.top_index(limit)
6 changes: 5 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,9 @@
doc.append(words)
rank = textrank.TextRank(doc)
rank.solve()
for index in rank.top_index(10):
for index in rank.top_index(5):
print sents[index]
keyword_rank = textrank.KeywordTextRank(doc)
keyword_rank.solve()
for w in keyword_rank.top_index(5):
print w

0 comments on commit 97072a4

Please sign in to comment.