Skip to content

Commit

Permalink
seg add zip marshal
Browse files Browse the repository at this point in the history
  • Loading branch information
isnowfy committed Dec 12, 2013
1 parent 24e2247 commit 607417c
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ s.sim([u'文章'])# [0.3756070762985226, 0, 0]

## Features

* 中文分词([Character-Based Generative Model](http://aclweb.org/anthology//Y/Y09/Y09-2047.pdf)(TODO: 新算法导致字典过大,现在暂时去掉了字典但是第一次import时间稍微长一点)
* 中文分词([Character-Based Generative Model](http://aclweb.org/anthology//Y/Y09/Y09-2047.pdf))
* 词性标注([TnT](http://aclweb.org/anthology//A/A00/A00-1031.pdf) 3-gram 隐马)
* 情感分析
* 文本分类(Naive Bayes)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def read(fname):

setup(
name='snownlp',
version='0.9.5',
version='0.9.6',
description='Python library for processing Chinese text',
author='isnowfy',
url='https://github.com/isnowfy/snownlp',
Expand Down
4 changes: 2 additions & 2 deletions snownlp/seg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
from . import seg as TnTseg

data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'data.txt')
'seg.marshal')
segger = TnTseg.Seg('other')
segger.train(data_path)
segger.load(data_path, True)


def seg(sent):
Expand Down
Binary file added snownlp/seg/seg.marshal
Binary file not shown.
Binary file added snownlp/seg/seg.marshal.3
Binary file not shown.
8 changes: 4 additions & 4 deletions snownlp/seg/seg.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ def __init__(self, name='tnt'):
else:
self.segger = CharacterBasedGenerativeModel()

def save(self, fname):
self.segger.save(fname)
def save(self, fname, iszip=False):
self.segger.save(fname, iszip)

def load(self, fname):
self.segger.load(fname)
def load(self, fname, iszip=False):
self.segger.load(fname, iszip)

def train(self, file_name):
fr = codecs.open(file_name, 'r', 'utf-8')
Expand Down
19 changes: 15 additions & 4 deletions snownlp/seg/y09_2047.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import unicode_literals

import sys
import gzip
import marshal
from math import log

Expand All @@ -19,7 +20,7 @@ def __init__(self):
self.bi = frequency.NormalProb()
self.tri = frequency.NormalProb()

def save(self, fname):
def save(self, fname, iszip=False):
d = {}
for k, v in self.__dict__.items():
if hasattr(v, '__dict__'):
Expand All @@ -28,12 +29,22 @@ def save(self, fname):
d[k] = v
if sys.version_info.major == 3:
fname = fname + '.3'
marshal.dump(d, open(fname, 'wb'))
if not iszip:
marshal.dump(d, open(fname, 'wb'))
else:
f = gzip.open(fname, 'wb')
f.write(marshal.dumps(d))
f.close()

def load(self, fname):
def load(self, fname, iszip=False):
if sys.version_info.major == 3:
fname = fname + '.3'
d = marshal.load(open(fname, 'rb'))
if not iszip:
d = marshal.load(open(fname, 'rb'))
else:
f = gzip.open(fname, 'rb')
d = marshal.loads(f.read())
f.close()
for k, v in d.items():
if hasattr(self.__dict__[k], '__dict__'):
self.__dict__[k].__dict__ = v
Expand Down

0 comments on commit 607417c

Please sign in to comment.