Skip to content

Commit

Permalink
change the dir
Browse files Browse the repository at this point in the history
  • Loading branch information
isnowfy committed Nov 29, 2013
1 parent 6790bf1 commit 3d66ca0
Show file tree
Hide file tree
Showing 12 changed files with 101 additions and 0 deletions.
Empty file added snownlp/__init__.py
Empty file.
2 changes: 2 additions & 0 deletions snownlp/normal/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
50 changes: 50 additions & 0 deletions snownlp/sim/bm25.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import math


class BM25(object):
    """Okapi BM25 relevance scorer over a fixed corpus of tokenized documents.

    ``docs`` is a sequence of documents, each an iterable of hashable tokens.
    Construction precomputes the corpus statistics used for scoring:

    * ``D``     -- number of documents
    * ``dl``    -- token count of each document
    * ``avgdl`` -- average document length in tokens (0.0 for an empty corpus)
    * ``f``     -- one ``{token: count}`` dict per document
    * ``df``    -- ``{token: number of documents containing the token}``
    * ``idf``   -- ``{token: log(D - df + 0.5) - log(df + 0.5)}``

    ``k1`` (term-frequency saturation) and ``b`` (length normalisation) are
    plain attributes and may be tuned after construction.
    """

    def __init__(self, docs):
        self.D = len(docs)
        self.avgdl = 0
        self.docs = docs
        self.f = []
        self.dl = []
        self.df = {}
        self.idf = {}
        self.k1 = 1.5
        self.b = 0.75
        self.init()

    def init(self):
        """(Re)build term, document-frequency and IDF tables from ``self.docs``."""
        # Start from a clean slate so that calling init() again does not
        # append duplicate statistics.
        self.f = []
        self.dl = []
        self.df = {}
        self.idf = {}
        for doc in self.docs:
            counts = {}
            for word in doc:
                counts[word] = counts.get(word, 0) + 1
            self.f.append(counts)
            self.dl.append(sum(counts.values()))
            # Each distinct word contributes once per document to df.
            for word in counts:
                self.df[word] = self.df.get(word, 0) + 1
        # float() keeps true division under Python 2 as well.
        self.avgdl = sum(self.dl) / float(self.D) if self.D else 0.0
        for word, doc_freq in self.df.items():
            self.idf[word] = (math.log(self.D - doc_freq + 0.5)
                              - math.log(doc_freq + 0.5))

    def sim(self, doc, index):
        """Return the BM25 score of query ``doc`` against document ``index``.

        ``doc`` is an iterable of tokens; a token repeated in the query
        contributes once per occurrence.  Tokens absent from the indexed
        document contribute nothing.  Raises ``IndexError`` if ``index`` is
        out of range.
        """
        freqs = self.f[index]
        score = 0
        if not self.avgdl:
            # Every document is empty, so nothing can match.
            return score
        # Length normalisation depends only on the indexed document, so it is
        # computed once rather than per query word.
        norm = self.k1 * (1 - self.b
                          + self.b * self.dl[index] / self.avgdl)
        for word in doc:
            if word not in freqs:
                continue
            tf = freqs[word]
            score += self.idf[word] * tf * (self.k1 + 1) / (tf + norm)
        return score

    def simall(self, doc):
        """Return the list of BM25 scores of ``doc`` against every document."""
        return [self.sim(doc, index) for index in range(self.D)]
49 changes: 49 additions & 0 deletions snownlp/summary/textrank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import bm25


class TextRank(object):
    """Rank documents with TextRank over a BM25 similarity graph.

    ``docs`` is a sequence of tokenized documents.  Each document is a graph
    vertex; the edge from document ``j`` to document ``i`` is weighted by the
    BM25 score ``weight[j][i]`` of ``j`` used as a query against ``i``.
    The scorer comes from the module-level ``bm25`` import.

    Call :meth:`solve` first, then :meth:`top_index` or :meth:`top` to read
    the ranking.

    Attributes:
        d: damping factor of the rank update.
        max_iter: upper bound on the number of update sweeps.
        min_diff: stop once no vertex score changes by more than this.
        vertex: current score of each document, filled in by ``solve``.
    """

    def __init__(self, docs):
        self.docs = docs
        self.bm25 = bm25.BM25(docs)
        self.D = len(docs)
        self.d = 0.85
        self.weight = []
        self.weight_sum = []
        self.vertex = []
        self.max_iter = 200
        self.min_diff = 0.001
        # Ranked (index, score) pairs.  Kept under a private name so that it
        # does not shadow the public ``top`` method on the instance.
        self._ranking = []

    def solve(self):
        """Build the similarity graph and iterate the scores to convergence."""
        # Reset so that solve() can safely be called more than once.
        self.weight = []
        self.weight_sum = []
        self.vertex = []
        for i, doc in enumerate(self.docs):
            scores = self.bm25.simall(doc)
            self.weight.append(scores)
            # Self-similarity is never propagated (j == i is skipped below),
            # so it must not count towards the outgoing weight either.
            self.weight_sum.append(sum(scores) - scores[i])
            self.vertex.append(1.0)
        for _ in range(self.max_iter):
            updated = []
            max_diff = 0
            for i in range(self.D):
                rank = 1 - self.d
                for j in range(self.D):
                    # A vertex with no outgoing weight has nothing to share.
                    if j == i or self.weight_sum[j] == 0:
                        continue
                    rank += (self.d * self.weight[j][i]
                             / self.weight_sum[j] * self.vertex[j])
                updated.append(rank)
                max_diff = max(max_diff, abs(rank - self.vertex[i]))
            self.vertex = updated
            if max_diff <= self.min_diff:
                break
        self._ranking = sorted(enumerate(self.vertex),
                               key=lambda pair: pair[1], reverse=True)

    def top_index(self, limit):
        """Return the indices of the ``limit`` highest-ranked documents.

        Indices are ordered from highest to lowest score.  The result is
        empty until :meth:`solve` has been called.
        """
        return [index for index, _ in self._ranking[:limit]]

    def top(self, limit):
        """Return the ``limit`` highest-ranked documents, best first.

        The result is empty until :meth:`solve` has been called.
        """
        return [self.docs[index] for index, _ in self._ranking[:limit]]
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit 3d66ca0

Please sign in to comment.