Skip to content

Commit

Permalink
standard the train api
Browse files Browse the repository at this point in the history
  • Loading branch information
isnowfy committed Apr 5, 2014
1 parent 1fa7a67 commit 27c4dea
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 14 deletions.
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,15 @@ $ pip install snownlp
以分词为例
分词在`snownlp/seg`目录下
~~~~{python}
from snownlp.seg.seg import Seg
seg = Seg()
from snownlp import seg
seg.train('data.txt')
seg.save('seg.marshal')
#from snownlp import tag
#tag.train('199801.txt')
#tag.save('tag.marshal')
#from snownlp import sentiment
#sentiment.train('neg.txt', 'pos.txt')
#sentiment.save('sentiment.marshal')
~~~~
这样训练好的文件就存储为`seg.marshal`了,之后修改`snownlp/seg/__init__.py`里的`data_path`指向刚训练好的文件即可

Expand Down
23 changes: 19 additions & 4 deletions snownlp/classification/bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import unicode_literals

import sys
import gzip
import marshal
from math import log, exp

Expand All @@ -14,20 +15,34 @@ def __init__(self):
self.d = {}
self.total = 0

def save(self, fname):
def save(self, fname, iszip=True):
d = {}
d['total'] = self.total
d['d'] = {}
for k, v in self.d.items():
d['d'][k] = v.__dict__
if sys.version_info[0] == 3:
fname = fname + '.3'
marshal.dump(d, open(fname, 'wb'))
if not iszip:
marshal.dump(d, open(fname, 'wb'))
else:
f = gzip.open(fname, 'wb')
f.write(marshal.dumps(d))
f.close()

def load(self, fname):
def load(self, fname, iszip=True):
if sys.version_info[0] == 3:
fname = fname + '.3'
d = marshal.load(open(fname, 'rb'))
if not iszip:
d = marshal.load(open(fname, 'rb'))
else:
try:
f = gzip.open(fname, 'rb')
d = marshal.loads(f.read())
except IOError:
f = open(fname, 'rb')
d = marshal.loads(f.read())
f.close()
self.total = d['total']
self.d = {}
for k, v in d['d'].items():
Expand Down
12 changes: 12 additions & 0 deletions snownlp/seg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,17 @@ def seg(sent):
return words


def train(fname):
    """Train the module-level ``segger`` using the file at *fname*.

    The path is handed straight to ``segger.train``; nothing is returned.
    """
    segger.train(fname)


def save(fname, iszip=True):
    """Save the module-level ``segger`` to *fname*.

    Both arguments are forwarded positionally to ``segger.save``.
    """
    target, zipped = fname, iszip
    segger.save(target, zipped)


def load(fname, iszip=True):
    """Load data from *fname* into the module-level ``segger``.

    Both arguments are forwarded positionally to ``segger.load``.
    """
    source, zipped = fname, iszip
    segger.load(source, zipped)


def single_seg(sent):
    """Return everything yielded by ``segger.seg(sent)`` as a list."""
    return [piece for piece in segger.seg(sent)]
4 changes: 2 additions & 2 deletions snownlp/seg/seg.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def save(self, fname, iszip=True):
def load(self, fname, iszip=True):
    """Load model data into the wrapped segmenter.

    Both arguments are forwarded positionally to ``self.segger.load``.
    """
    backend = self.segger
    backend.load(fname, iszip)

def train(self, file_name):
fr = codecs.open(file_name, 'r', 'utf-8')
def train(self, fname):
fr = codecs.open(fname, 'r', 'utf-8')
data = []
for i in fr:
line = i.strip()
Expand Down
23 changes: 19 additions & 4 deletions snownlp/sentiment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import unicode_literals

import os
import codecs

from .. import normal
from .. import seg
Expand All @@ -16,11 +17,11 @@ class Sentiment(object):
def __init__(self):
    """Attach a newly constructed ``Bayes`` instance as ``self.classifier``."""
    bayes = Bayes()
    self.classifier = bayes

def save(self, fname):
self.classifier.save(fname)
def save(self, fname, iszip=True):
self.classifier.save(fname, iszip)

def load(self, fname=data_path):
self.classifier.load(fname)
def load(self, fname=data_path, iszip=True):
self.classifier.load(fname, iszip)

def handle(self, doc):
words = seg.seg(doc)
Expand All @@ -46,5 +47,19 @@ def classify(self, sent):
classifier.load()


def train(neg_file, pos_file):
    """Train the module-level ``classifier`` from two UTF-8 text files.

    Each file is read in full with ``readlines()``, so every line becomes
    one entry of the list handed to ``classifier.train``.

    Args:
        neg_file: Path of the file passed as the first (negative) corpus.
        pos_file: Path of the file passed as the second (positive) corpus.

    Raises:
        IOError: If either file cannot be opened.
        UnicodeDecodeError: If either file is not valid UTF-8.
    """
    # Use context managers so both handles are closed even when reading
    # fails; previously the files were opened and never closed.
    with codecs.open(neg_file, 'r', 'utf-8') as neg_fp:
        neg_docs = neg_fp.readlines()
    with codecs.open(pos_file, 'r', 'utf-8') as pos_fp:
        pos_docs = pos_fp.readlines()
    classifier.train(neg_docs, pos_docs)


def save(fname, iszip=True):
    """Save the module-level ``classifier`` to *fname*.

    Both arguments are forwarded positionally to ``classifier.save``.
    """
    target, zipped = fname, iszip
    classifier.save(target, zipped)


def load(fname, iszip=True):
    """Load data from *fname* into the module-level ``classifier``.

    Both arguments are forwarded positionally to ``classifier.load``.
    """
    source, zipped = fname, iszip
    classifier.load(source, zipped)


def classify(sent):
    """Return whatever ``classifier.classify`` produces for *sent*."""
    result = classifier.classify(sent)
    return result
Binary file modified snownlp/sentiment/sentiment.marshal
Binary file not shown.
Binary file modified snownlp/sentiment/sentiment.marshal.3
Binary file not shown.
12 changes: 10 additions & 2 deletions snownlp/tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
tagger.load(data_path)


def train(file_name):
fr = codecs.open(file_name, 'r', 'utf-8')
def train(fname):
fr = codecs.open(fname, 'r', 'utf-8')
data = []
for i in fr:
line = i.strip()
Expand All @@ -25,6 +25,14 @@ def train(file_name):
tagger.train(data)


def save(fname, iszip=True):
    """Save the module-level ``tagger`` to *fname*.

    Both arguments are forwarded positionally to ``tagger.save``.
    """
    target, zipped = fname, iszip
    tagger.save(target, zipped)


def load(fname, iszip=True):
    """Load data from *fname* into the module-level ``tagger``.

    Both arguments are forwarded positionally to ``tagger.load``.
    """
    source, zipped = fname, iszip
    tagger.load(source, zipped)


def tag_all(words):
    """Return the result of ``tagger.tag`` applied to *words*."""
    tagged = tagger.tag(words)
    return tagged

Expand Down

0 comments on commit 27c4dea

Please sign in to comment.