Skip to content

Commit

Permalink
add marshal serialization
Browse files Browse the repository at this point in the history
  • Loading branch information
isnowfy committed Dec 7, 2013
1 parent 976ce51 commit 8f0bebd
Show file tree
Hide file tree
Showing 10 changed files with 46 additions and 16 deletions.
6 changes: 3 additions & 3 deletions snownlp/classification/bayes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import json
import marshal
from math import log, exp

from ..utils.frequency import AddOneProb
Expand All @@ -19,10 +19,10 @@ def save(self, fname):
d['d'] = {}
for k, v in self.d.iteritems():
d['d'][k] = v.__dict__
json.dump(d, open(fname, 'w'))
marshal.dump(d, open(fname, 'w'))

def load(self, fname):
d = json.load(open(fname, 'r'))
d = marshal.load(open(fname, 'r'))
self.total = d['total']
self.d = {}
for k, v in d['d'].iteritems():
Expand Down
4 changes: 2 additions & 2 deletions snownlp/seg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
import seg as TnTseg

data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'data.txt')
'seg.marshal')
segger = TnTseg.Seg()
segger.train(data_path)
segger.load(data_path)


def seg(sent):
Expand Down
Binary file added snownlp/seg/seg.marshal
Binary file not shown.
6 changes: 6 additions & 0 deletions snownlp/seg/seg.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ class Seg(object):
def __init__(self):
self.segger = TnT()

def save(self, fname):
    """Write the wrapped tagger's state to ``fname``.

    The work is delegated entirely to ``self.segger.save``.
    """
    model = self.segger
    model.save(fname)

def load(self, fname):
    """Read the wrapped tagger's state back from ``fname``.

    The work is delegated entirely to ``self.segger.load``.
    """
    model = self.segger
    model.load(fname)

def train(self, file_name):
fr = codecs.open(file_name, 'r', 'utf-8')
data = []
Expand Down
2 changes: 1 addition & 1 deletion snownlp/sentiment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ..classification.bayes import Bayes

data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'sentiment.json')
'sentiment.marshal')


class Sentiment(object):
Expand Down
1 change: 0 additions & 1 deletion snownlp/sentiment/sentiment.json

This file was deleted.

Binary file added snownlp/sentiment/sentiment.marshal
Binary file not shown.
5 changes: 2 additions & 3 deletions snownlp/tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
from ..utils.tnt import TnT

data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'199801.txt')
'tag.marshal')
tagger = TnT()
tagger.load(data_path)


def train(file_name):
Expand All @@ -23,8 +24,6 @@ def train(file_name):
fr.close()
tagger.train(data)

train(data_path)


def tag_all(words):
return tagger.tag(words)
Expand Down
Binary file added snownlp/tag/tag.marshal
Binary file not shown.
38 changes: 32 additions & 6 deletions snownlp/utils/tnt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
'''

import heapq
import marshal
from math import log

import frequency


class TnT(object):

def __init__(self, N=1000):
Expand All @@ -26,6 +28,27 @@ def __init__(self, N=1000):
self.word = {}
self.trans = {}

def save(self, fname):
    """Serialize this object's attributes to ``fname`` with ``marshal``.

    Every entry of ``self.__dict__`` is written under its attribute name:
    sets are stored as lists, values that carry a ``__dict__`` are stored
    as that ``__dict__``, and everything else is stored unchanged.

    :param fname: path of the file to write; existing content is replaced.
    """
    d = {}
    # ``items()`` iterates the same pairs as ``iteritems()`` on Python 2
    # and also exists on Python 3.
    for k, v in self.__dict__.items():
        if isinstance(v, set):
            d[k] = list(v)
        elif hasattr(v, '__dict__'):
            d[k] = v.__dict__
        else:
            d[k] = v
    # marshal produces bytes, so the file must be opened in binary mode;
    # the ``with`` block guarantees the data is flushed and the handle closed.
    with open(fname, 'wb') as fw:
        marshal.dump(d, fw)

def load(self, fname):
    """Restore attributes from a ``marshal`` file written by ``save``.

    For each stored key the current attribute of the same name decides
    how the value is applied: if it is a set, the stored value is turned
    back into a set; if it carries a ``__dict__``, that ``__dict__`` is
    replaced by the stored value; otherwise the attribute is overwritten.

    :param fname: path of the file to read.
    :raises KeyError: if the file holds a key that is not already an
        attribute of this instance.
    """
    # NOTE: marshal offers no protection against malformed or malicious
    # input; only load files from a trusted source.
    # Binary mode matches the byte stream marshal writes, and the ``with``
    # block closes the handle as soon as the data has been read.
    with open(fname, 'rb') as fr:
        d = marshal.load(fr)
    # ``items()`` iterates the same pairs as ``iteritems()`` on Python 2
    # and also exists on Python 3.
    for k, v in d.items():
        current = self.__dict__[k]
        if isinstance(current, set):
            self.__dict__[k] = set(v)
        elif hasattr(current, '__dict__'):
            current.__dict__ = v
        else:
            self.__dict__[k] = v

def tnt_div(self, v1, v2):
if v2 == 0:
return 0
Expand Down Expand Up @@ -60,8 +83,10 @@ def train(self, data):
tl2 = 0.0
tl3 = 0.0
for now in self.tri.samples():
c3 = self.tnt_div(self.tri.get(now)[1]-1, self.bi.get(now[:2])[1]-1)
c2 = self.tnt_div(self.bi.get(now[1:])[1]-1, self.uni.get(now[1])[1]-1)
c3 = self.tnt_div(self.tri.get(now)[1]-1,
self.bi.get(now[:2])[1]-1)
c2 = self.tnt_div(self.bi.get(now[1:])[1]-1,
self.uni.get(now[1])[1]-1)
c1 = self.tnt_div(self.uni.get(now[2])[1]-1, self.uni.getsum()-1)
if c3 >= c1 and c3 >= c2:
tl3 += self.tri.get(now)[1]
Expand All @@ -72,8 +97,8 @@ def train(self, data):
self.l1 = float(tl1)/(tl1+tl2+tl3)
self.l2 = float(tl2)/(tl1+tl2+tl3)
self.l3 = float(tl3)/(tl1+tl2+tl3)
for s1 in self.status|set(('BOS',)):
for s2 in self.status|set(('BOS',)):
for s1 in self.status | set(('BOS',)):
for s2 in self.status | set(('BOS',)):
for s3 in self.status:
uni = self.l1*self.uni.freq(s3)
bi = self.tnt_div(self.l2*self.bi.get((s2, s3))[1],
Expand All @@ -93,9 +118,10 @@ def tag(self, data):
wd = log(self.wd.get((s, w))[1])-log(self.uni.get(s)[1])
for pre in now:
p = pre[1]+wd+self.trans[(pre[0][0], pre[0][1], s)]
if (pre[0][1], s) not in stage or p > stage[(pre[0][1], s)][0]:
if (pre[0][1], s) not in stage or p > stage[(pre[0][1],
s)][0]:
stage[(pre[0][1], s)] = (p, pre[2]+[s])
stage = map(lambda x: (x[0], x[1][0], x[1][1]), stage.items())
now = heapq.nlargest(self.N, stage, key=lambda x:x[1])
now = heapq.nlargest(self.N, stage, key=lambda x: x[1])
now = heapq.nlargest(1, stage, key=lambda x: x[1]+self.geteos(x[0][1]))
return zip(data, now[0][2])

0 comments on commit 8f0bebd

Please sign in to comment.