fix isnowfy#32

fmchan · Sep 27, 2015 · 3c1c528 · 3c1c528
1 parent 262aa10
commit 3c1c528
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 9 deletions.
diff --git a/setup.py b/setup.py
@@ -45,7 +45,7 @@ def read(fname):
 
 setup(
     name='snownlp',
-    version='0.12.2',
+    version='0.12.3',
     description='Python library for processing Chinese text',
     author='isnowfy',
     url='https://github.com/isnowfy/snownlp',

diff --git a/snownlp/seg/y09_2047.py b/snownlp/seg/y09_2047.py
@@ -74,7 +74,8 @@ def train(self, data):
         tl1 = 0.0
         tl2 = 0.0
         tl3 = 0.0
-        for now in self.tri.samples():
+        samples = sorted(self.tri.samples(), key=lambda x: self.tri.get(x)[1])
+        for now in samples:
             c3 = self.div(self.tri.get(now)[1]-1, self.bi.get(now[:2])[1]-1)
             c2 = self.div(self.bi.get(now[1:])[1]-1, self.uni.get(now[1])[1]-1)
             c1 = self.div(self.uni.get(now[2])[1]-1, self.uni.getsum()-1)
@@ -101,6 +102,18 @@ def tag(self, data):
         now = [((('', 'BOS'), ('', 'BOS')), 0.0, [])]
         for w in data:
             stage = {}
+            not_found = True
+            for s in self.status:
+                if self.uni.freq((w, s)) != 0:
+                    not_found = False
+                    break
+            if not_found:
+                for s in self.status:
+                    for pre in now:
+                        stage[(pre[0][1], (w, s))] = (pre[1], pre[2]+[s])
+                now = list(map(lambda x: (x[0], x[1][0], x[1][1]),
+                               stage.items()))
+                continue
             for s in self.status:
                 for pre in now:
                     p = pre[1]+self.log_prob(pre[0][0], pre[0][1], (w, s))

diff --git a/snownlp/summary/textrank.py b/snownlp/summary/textrank.py
@@ -79,14 +79,20 @@ def solve(self):
         for _ in range(self.max_iter):
             m = {}
             max_diff = 0
-            for k, v in self.words.items():
-                m[k] = 1-self.d
-                for j in v:
-                    if k == j or len(self.words[j]) == 0:
+            tmp = filter(lambda x: len(self.words[x[0]]) > 0,
+                         self.vertex.items())
+            tmp = sorted(tmp, key=lambda x: x[1] / len(self.words[x[0]]))
+            for k, v in tmp:
+                for j in self.words[k]:
+                    if k == j:
                         continue
-                    m[k] += (self.d/len(self.words[j])*self.vertex[j])
-                if abs(m[k] - self.vertex[k]) > max_diff:
-                    max_diff = abs(m[k] - self.vertex[k])
+                    if j not in m:
+                        m[j] = 1 - self.d
+                    m[j] += (self.d / len(self.words[k]) * self.vertex[k])
+            for k in self.vertex:
+                if k in m and k in self.vertex:
+                    if abs(m[k] - self.vertex[k]) > max_diff:
+                        max_diff = abs(m[k] - self.vertex[k])
             self.vertex = m
             if max_diff <= self.min_diff:
                 break