add ignore

GDStephen · Nov 29, 2013 · 4974d0c · 4974d0c
1 parent e90e3f5
commit 4974d0c
Show file tree

Hide file tree

Showing 9 changed files with 75 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.py[cod]
diff --git a/snownlp/normal/__init__.py b/snownlp/normal/__init__.py
@@ -1,2 +1,23 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
+
+import os
+import codecs
+
+import zh
+
+# Path of the stop-word list, expected to sit next to this module.
+data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                         'stopwords')
+# Populated once at import time: every line of the UTF-8 file, with
+# surrounding whitespace stripped, becomes one entry of ``stop``.
+stop = set()
+# ``with`` guarantees the handle is closed even if reading or decoding a
+# line raises, which the manual open()/close() pair did not.
+with codecs.open(data_path, 'r', 'utf-8') as fr:
+    for word in fr:
+        stop.add(word.strip())
+
+
+def filter_stop(words):
+    """Drop every item of ``words`` that is a member of the ``stop`` set.
+
+    Surviving items keep their original order; the return value is
+    whatever the built-in ``filter`` produces for the given input.
+    """
+    def is_kept(word):
+        return word not in stop
+
+    return filter(is_kept, words)
+
+
+def zh2hans(sent):
+    """Pass ``sent`` through ``zh.transfer`` and return the result.
+
+    NOTE(review): the name suggests conversion to Simplified Chinese;
+    confirm against the ``zh`` module.
+    """
+    converted = zh.transfer(sent)
+    return converted
diff --git a/snownlp/seg/__init__.py b/snownlp/seg/__init__.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import os
+
+import seg as TnTseg
+
+# Path of the training file, expected to sit next to this module.
+data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                         'data.txt')
+# Module-level segmenter shared by ``seg()``; it is created and trained
+# as a side effect of importing this package.
+segger = TnTseg.Seg()
+segger.train(data_path)
+
+
+def seg(sent):
+    """Segment ``sent`` with the module-level ``segger``.
+
+    The output of ``segger.seg`` is materialised into a list before it is
+    returned.
+    """
+    pieces = segger.seg(sent)
+    return list(pieces)
diff --git a/snownlp/seg/seg.py b/snownlp/seg/seg.py
@@ -4,13 +4,13 @@
 
 import codecs
 
-import tnt
+from ..utils.tnt import TnT
 
 
 class Seg(object):
 
     def __init__(self):
-        self.segger = tnt.TnT()
+        self.segger = TnT()
 
     def train(self, file_name):
         fr = codecs.open(file_name, 'r', 'utf-8')

diff --git a/snownlp/sim/__init__.py b/snownlp/sim/__init__.py
diff --git a/snownlp/summary/__init__.py b/snownlp/summary/__init__.py
diff --git a/snownlp/summary/textrank.py b/snownlp/summary/textrank.py
@@ -1,14 +1,14 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-import bm25
+from ..sim.bm25 import BM25
 
 
 class TextRank(object):
 
     def __init__(self, docs):
         self.docs = docs
-        self.bm25 = bm25.BM25(docs)
+        self.bm25 = BM25(docs)
         self.D = len(docs)
         self.d = 0.85
         self.weight = []

diff --git a/snownlp/tag/__init__.py b/snownlp/tag/__init__.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import os
+import codecs
+
+from ..utils.tnt import TnT
+
+# Path of the tagged corpus file, expected to sit next to this module.
+data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                         '199801.txt')
+# Module-level TnT instance shared by ``train``, ``tag_all`` and ``tag``.
+tagger = TnT()
+
+
+def train(file_name):
+    """Train the module-level ``tagger`` from a UTF-8 corpus file.
+
+    Every non-blank line is split on whitespace into tokens, and each
+    token is split on ``'/'``. The list of split tokens for one line forms
+    one training sentence; all sentences are handed to ``tagger.train``.
+
+    :param file_name: path of the corpus file to read.
+    """
+    data = []
+    # ``with`` closes the file even when reading or decoding raises.
+    with codecs.open(file_name, 'r', 'utf-8') as fr:
+        for raw in fr:
+            line = raw.strip()
+            if not line:
+                continue
+            # Build a real list rather than a ``map`` result, so each
+            # sentence can be iterated more than once on Python 3 too.
+            data.append([token.split('/') for token in line.split()])
+    tagger.train(data)
+
+# Train once, as a side effect of importing this package.
+train(data_path)
+
+
+def tag_all(words):
+    """Run the module-level ``tagger`` on ``words`` and return its output
+    unchanged."""
+    tagged = tagger.tag(words)
+    return tagged
+
+
+def tag(words):
+    """Return a list holding element ``[1]`` of every item that
+    ``tag_all(words)`` yields.
+
+    A list comprehension is used instead of ``map`` so the result is a
+    real list on Python 3 as well as Python 2.
+    """
+    return [pair[1] for pair in tag_all(words)]
diff --git a/snownlp/utils/__init__.py b/snownlp/utils/__init__.py