Skip to content

Commit

Permalink
change seg algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
isnowfy committed Dec 10, 2013
1 parent 11cfbcc commit 549986a
Show file tree
Hide file tree
Showing 12 changed files with 127 additions and 15 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ s.tags # [(u'这个', u'r'), (u'东西', u'n'),
# (u'真心', u'd'), (u'很', u'd'),
# (u'赞', u'Vg')]
s.sentiments # 0.9911384166031976 positive的概率
s.sentiments # 0.9830157237610916 positive的概率
s.pinyin # [u'zhe', u'ge', u'dong', u'xi',
# u'zhen', u'xin', u'hen', u'zan']
Expand All @@ -41,9 +41,10 @@ s.keywords(3) # [u'语言', u'自然', u'计算机']
s.summary(3) # [u'自然语言处理是一门融语言学、计算机科学、
# 数学于一体的科学',
# u'即人们日常使用的语言',
# u'自然语言处理是计算机科学领域与人工智能
# 领域中的一个重要方向']
# 领域中的一个重要方向',
# u'而在于研制能有效地实现自然语言通信的计
# 算机系统']
s.sentences
s = SnowNLP([[u'这篇', u'文章'],
Expand All @@ -56,7 +57,7 @@ s.sim([u'文章'])# [0.3756070762985226, 0, 0]

## Features

* 中文分词([TnT](http://aclweb.org/anthology//A/A00/A00-1031.pdf) 3-gram 隐马
* 中文分词([Character-Based Generative Model](http://aclweb.org/anthology//Y/Y09/Y09-2047.pdf))
* 词性标注([TnT](http://aclweb.org/anthology//A/A00/A00-1031.pdf) 3-gram 隐马)
* 情感分析
* 文本分类(Naive Bayes)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def read(fname):

setup(
name='snownlp',
version='0.9.3',
version='0.9.5',
description='Python library for processing Chinese text',
author='isnowfy',
url='https://github.com/isnowfy/snownlp',
Expand Down
2 changes: 1 addition & 1 deletion snownlp/classification/bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def save(self, fname):
d = {}
d['total'] = self.total
d['d'] = {}
for k, v in self.d.iteritems():
for k, v in self.d.items():
d['d'][k] = v.__dict__
if sys.version_info.major == 3:
fname = fname + '.3'
Expand Down
6 changes: 6 additions & 0 deletions snownlp/normal/stopwords.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@
[
]
Expand Down
6 changes: 3 additions & 3 deletions snownlp/seg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
from . import seg as TnTseg

data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'seg.marshal')
segger = TnTseg.Seg()
segger.load(data_path)
'data.txt')
segger = TnTseg.Seg('other')
segger.train(data_path)


def seg(sent):
Expand Down
Binary file removed snownlp/seg/seg.marshal
Binary file not shown.
Binary file removed snownlp/seg/seg.marshal.3
Binary file not shown.
18 changes: 13 additions & 5 deletions snownlp/seg/seg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,16 @@
import codecs

from ..utils.tnt import TnT
from .y09_2047 import CharacterBasedGenerativeModel


class Seg(object):

def __init__(self):
self.segger = TnT()
def __init__(self, name='tnt'):
if name == 'tnt':
self.segger = TnT()
else:
self.segger = CharacterBasedGenerativeModel()

def save(self, fname):
self.segger.save(fname)
Expand All @@ -34,13 +38,17 @@ def seg(self, sentence):
ret = self.segger.tag(sentence)
tmp = ''
for i in ret:
if i[1] == 's':
yield i[0]
elif i[1] == 'e':
if i[1] == 'e':
yield tmp+i[0]
tmp = ''
elif i[1] == 'b' or i[1] == 's':
if tmp:
yield tmp
tmp = i[0]
else:
tmp += i[0]
if tmp:
yield tmp


if __name__ == '__main__':
Expand Down
97 changes: 97 additions & 0 deletions snownlp/seg/y09_2047.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import sys
import marshal
from math import log

from ..utils import frequency


class CharacterBasedGenerativeModel(object):
    """Trigram tagger over (character, status) pairs.

    Each character is labelled with one of the statuses in ``self.status``
    ('b', 'm', 'e', 's').  Scores are a linear interpolation of unigram,
    bigram and trigram relative frequencies weighted by ``l1``, ``l2`` and
    ``l3``; the weights are estimated in :meth:`train`.

    NOTE(review): the counters are ``frequency.NormalProb`` objects; this
    class relies on their ``add``/``get``/``freq``/``getsum``/``samples``
    methods and on ``get(key)[1]`` being the stored count -- confirm against
    ``utils/frequency.py``.
    """

    def __init__(self):
        # Interpolation weights for the unigram, bigram and trigram terms.
        self.l1 = 0.0
        self.l2 = 0.0
        self.l3 = 0.0
        # Candidate labels tried for every character in tag().
        self.status = ('b', 'm', 'e', 's')
        # N-gram counters keyed by (char, status) pairs / tuples of pairs.
        self.uni = frequency.NormalProb()
        self.bi = frequency.NormalProb()
        self.tri = frequency.NormalProb()

    def save(self, fname):
        """Dump the model state to ``fname`` using ``marshal``.

        Attributes that carry their own ``__dict__`` (the counters) are
        stored as that dict; everything else is stored as-is.  On Python 3
        the suffix ``'.3'`` is appended to ``fname``.
        """
        d = {}
        for k, v in self.__dict__.items():
            if hasattr(v, '__dict__'):
                d[k] = v.__dict__
            else:
                d[k] = v
        if sys.version_info.major == 3:
            fname = fname + '.3'
        # Context manager so the handle is flushed and closed deterministically.
        with open(fname, 'wb') as f:
            marshal.dump(d, f)

    def load(self, fname):
        """Restore state written by :meth:`save` from ``fname``.

        On Python 3 the suffix ``'.3'`` is appended to ``fname``.  Every key
        in the file must already exist as an attribute of this instance.
        """
        if sys.version_info.major == 3:
            fname = fname + '.3'
        with open(fname, 'rb') as f:
            d = marshal.load(f)
        for k, v in d.items():
            if hasattr(self.__dict__[k], '__dict__'):
                # Counter objects were saved as their __dict__; put it back.
                self.__dict__[k].__dict__ = v
            else:
                self.__dict__[k] = v

    def div(self, v1, v2):
        """Return ``v1 / v2`` as a float, or ``0`` when ``v2`` is zero."""
        if v2 == 0:
            return 0
        return float(v1)/v2

    def train(self, data):
        """Collect n-gram counts from ``data`` and estimate ``l1``..``l3``.

        ``data`` is an iterable of sentences, each an iterable of
        ``(character, status)`` pairs.
        """
        for sentence in data:
            # Start every sentence from the BOS/BOS context.  Building this
            # window once outside the loop would let the tail of the previous
            # sentence leak into the first n-grams of the next one, which
            # disagrees both with the BOS counts added below and with the
            # BOS/BOS start state used by tag().
            now = [('', 'BOS'), ('', 'BOS')]
            self.bi.add((('', 'BOS'), ('', 'BOS')), 1)
            self.uni.add(('', 'BOS'), 2)
            for word, tag in sentence:
                now.append((word, tag))
                self.uni.add((word, tag), 1)
                self.bi.add(tuple(now[1:]), 1)
                self.tri.add(tuple(now), 1)
                now.pop(0)
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0
        # For each observed trigram, compare the three leave-one-out relative
        # frequencies and credit the trigram's count to the best order.
        for gram in self.tri.samples():
            count = self.tri.get(gram)[1]
            c3 = self.div(count-1, self.bi.get(gram[:2])[1]-1)
            c2 = self.div(self.bi.get(gram[1:])[1]-1,
                          self.uni.get(gram[1])[1]-1)
            c1 = self.div(self.uni.get(gram[2])[1]-1, self.uni.getsum()-1)
            if c3 >= c1 and c3 >= c2:
                tl3 += count
            elif c2 >= c1 and c2 >= c3:
                tl2 += count
            elif c1 >= c2 and c1 >= c3:
                tl1 += count
        total = tl1+tl2+tl3
        self.l1 = self.div(tl1, total)
        self.l2 = self.div(tl2, total)
        self.l3 = self.div(tl3, total)

    def log_prob(self, s1, s2, s3):
        """Return the log of the interpolated probability of ``s3``.

        ``s1`` and ``s2`` are the two preceding (character, status) pairs.
        Returns ``-inf`` when all three interpolated terms are zero.
        """
        uni = self.l1*self.uni.freq(s3)
        bi = self.div(self.l2*self.bi.get((s2, s3))[1], self.uni.get(s2)[1])
        tri = self.div(self.l3*self.tri.get((s1, s2, s3))[1],
                       self.bi.get((s1, s2))[1])
        if uni+bi+tri == 0:
            return float('-inf')
        return log(uni+bi+tri)

    def tag(self, data):
        """Label every item of ``data`` with its best-scoring status.

        Runs a dynamic-programming search whose states are the last two
        (character, status) pairs, keeping only the best path per state.
        Returns ``zip(data, statuses)``.
        """
        # Each entry: (last two pairs, accumulated log score, status path).
        now = [((('', 'BOS'), ('', 'BOS')), 0.0, [])]
        for w in data:
            stage = {}
            for s in self.status:
                for pre in now:
                    p = pre[1]+self.log_prob(pre[0][0], pre[0][1], (w, s))
                    key = (pre[0][1], (w, s))
                    # Keep only the highest-scoring path into each state.
                    if key not in stage or p > stage[key][0]:
                        stage[key] = (p, pre[2]+[s])
            now = [(k, v[0], v[1]) for k, v in stage.items()]
        return zip(data, max(now, key=lambda x: x[1])[2])
Binary file modified snownlp/sentiment/sentiment.marshal
Binary file not shown.
Binary file modified snownlp/sentiment/sentiment.marshal.3
Binary file not shown.
2 changes: 1 addition & 1 deletion snownlp/utils/tnt.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def __init__(self, N=1000):

def save(self, fname):
d = {}
for k, v in self.__dict__.iteritems():
for k, v in self.__dict__.items():
if isinstance(v, set):
d[k] = list(v)
elif hasattr(v, '__dict__'):
Expand Down

0 comments on commit 549986a

Please sign in to comment.