Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
isnowfy committed Mar 31, 2014
1 parent 796eeeb commit 1fa7a67
Show file tree
Hide file tree
Showing 9 changed files with 35 additions and 16 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ $ pip install snownlp
from snownlp.seg.seg import Seg
seg = Seg()
seg.train('data.txt')
seg.save('data.marshal')
seg.save('seg.marshal')
~~~~
这样训练好的文件就存储为`data.marshal`了,之后修改`snownlp/seg/__init__.py`里的`data_path`指向刚训练好的文件即可
这样训练好的文件就存储为`seg.marshal`了,之后修改`snownlp/seg/__init__.py`里的`data_path`指向刚训练好的文件即可

## License

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def read(fname):

setup(
name='snownlp',
version='0.9.10',
version='0.10.1',
description='Python library for processing Chinese text',
author='isnowfy',
url='https://github.com/isnowfy/snownlp',
Expand Down
Binary file modified snownlp/seg/seg.marshal
Binary file not shown.
Binary file modified snownlp/seg/seg.marshal.3
Binary file not shown.
6 changes: 3 additions & 3 deletions snownlp/seg/seg.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@

class Seg(object):

def __init__(self, name='tnt'):
def __init__(self, name='other'):
if name == 'tnt':
self.segger = TnT()
else:
self.segger = CharacterBasedGenerativeModel()

def save(self, fname, iszip=False):
def save(self, fname, iszip=True):
self.segger.save(fname, iszip)

def load(self, fname, iszip=False):
def load(self, fname, iszip=True):
self.segger.load(fname, iszip)

def train(self, file_name):
Expand Down
14 changes: 9 additions & 5 deletions snownlp/seg/y09_2047.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def __init__(self):
self.bi = frequency.NormalProb()
self.tri = frequency.NormalProb()

def save(self, fname, iszip=False):
def save(self, fname, iszip=True):
d = {}
for k, v in self.__dict__.items():
if hasattr(v, '__dict__'):
Expand All @@ -36,14 +36,18 @@ def save(self, fname, iszip=False):
f.write(marshal.dumps(d))
f.close()

def load(self, fname, iszip=False):
def load(self, fname, iszip=True):
if sys.version_info[0] == 3:
fname = fname + '.3'
if not iszip:
d = marshal.load(open(fname, 'rb'))
else:
f = gzip.open(fname, 'rb')
d = marshal.loads(f.read())
try:
f = gzip.open(fname, 'rb')
d = marshal.loads(f.read())
except IOError:
f = open(fname, 'rb')
d = marshal.loads(f.read())
f.close()
for k, v in d.items():
if hasattr(self.__dict__[k], '__dict__'):
Expand All @@ -57,8 +61,8 @@ def div(self, v1, v2):
return float(v1)/v2

def train(self, data):
now = [('', 'BOS'), ('', 'BOS')]
for sentence in data:
now = [('', 'BOS'), ('', 'BOS')]
self.bi.add((('', 'BOS'), ('', 'BOS')), 1)
self.uni.add(('', 'BOS'), 2)
for word, tag in sentence:
Expand Down
Binary file modified snownlp/tag/tag.marshal
Binary file not shown.
Binary file modified snownlp/tag/tag.marshal.3
Binary file not shown.
25 changes: 20 additions & 5 deletions snownlp/utils/tnt.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from __future__ import unicode_literals

import sys
import gzip
import heapq
import marshal
from math import log
Expand All @@ -30,7 +31,7 @@ def __init__(self, N=1000):
self.word = {}
self.trans = {}

def save(self, fname):
def save(self, fname, iszip=True):
d = {}
for k, v in self.__dict__.items():
if isinstance(v, set):
Expand All @@ -41,12 +42,26 @@ def save(self, fname):
d[k] = v
if sys.version_info[0] == 3:
fname = fname + '.3'
marshal.dump(d, open(fname, 'wb'))
if not iszip:
marshal.dump(d, open(fname, 'wb'))
else:
f = gzip.open(fname, 'wb')
f.write(marshal.dumps(d))
f.close()

def load(self, fname):
def load(self, fname, iszip=True):
if sys.version_info[0] == 3:
fname = fname + '.3'
d = marshal.load(open(fname, 'rb'))
if not iszip:
d = marshal.load(open(fname, 'rb'))
else:
try:
f = gzip.open(fname, 'rb')
d = marshal.loads(f.read())
except IOError:
f = open(fname, 'rb')
d = marshal.loads(f.read())
f.close()
for k, v in d.items():
if isinstance(self.__dict__[k], set):
self.__dict__[k] = set(v)
Expand All @@ -67,8 +82,8 @@ def geteos(self, tag):
return log(self.eos.get((tag, 'EOS'))[1])-log(self.eosd.get(tag)[1])

def train(self, data):
now = ['BOS', 'BOS']
for sentence in data:
now = ['BOS', 'BOS']
self.bi.add(('BOS', 'BOS'), 1)
self.uni.add('BOS', 2)
for word, tag in sentence:
Expand Down

0 comments on commit 1fa7a67

Please sign in to comment.