Skip to content

Commit

Permalink
修复 observerss#1, 并添加了一个简单的测试
Browse files Browse the repository at this point in the history
  • Loading branch information
observerss committed Oct 27, 2014
1 parent a60a2a8 commit 0b9269e
Showing 1 changed file with 63 additions and 47 deletions.
110 changes: 63 additions & 47 deletions filter.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,59 @@
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# -*- coding:utf-8 -*-
from collections import defaultdict
import re
import os

__all__ = ['NaiveFilter','BSFilter','DFAFilter']
__all__ = ['NaiveFilter', 'BSFilter', 'DFAFilter']
__author__ = 'observer'
__date__ = '2012.01.05'


class NaiveFilter():

    '''Filter messages by replacing every known keyword.

    Very simple implementation: the message is lower-cased and each stored
    keyword is substituted with ``str.replace``.  Every occurrence of a
    keyword is replaced by ONE copy of ``repl`` (not one per character).

    >>> f = NaiveFilter()
    >>> f.add("sexy")
    >>> print(f.filter("hello sexy baby"))
    hello * baby
    '''

    def __init__(self):
        # Lower-cased text keywords; a set so duplicates collapse.
        self.keywords = set()

    def add(self, keyword):
        '''Register one keyword.

        ``keyword`` may be text or a UTF-8 encoded byte string.  It is
        stored lower-cased.  Empty keywords are ignored, because
        ``"abc".replace("", "*")`` would insert ``repl`` between every
        character of every message.
        '''
        if isinstance(keyword, bytes):
            keyword = keyword.decode('utf-8')
        keyword = keyword.lower()
        if keyword:
            self.keywords.add(keyword)

    def parse(self, path):
        '''Load keywords from the file at ``path``, one per line.

        The file is read as bytes and decoded as UTF-8 so the result does
        not depend on the platform default encoding.  Surrounding
        whitespace is stripped and blank lines are skipped.
        '''
        # "with" guarantees the handle is closed even if decoding fails.
        with open(path, 'rb') as f:
            for line in f:
                self.add(line.strip())

    def filter(self, message, repl="*"):
        '''Return lower-cased ``message`` with every keyword replaced.

        ``message`` may be text, a UTF-8 encoded byte string, or any other
        object (which is converted to text first).  Each keyword occurrence
        is replaced by a single ``repl``.
        '''
        text_type = type(u'')
        if isinstance(message, bytes):
            # Decode explicitly as UTF-8, as BSFilter and DFAFilter do;
            # an implicit ASCII conversion fails on non-ASCII bytes.
            message = message.decode('utf-8')
        elif not isinstance(message, text_type):
            message = text_type(message)
        message = message.lower()
        for kw in self.keywords:
            message = message.replace(kw, repl)
        return message


class BSFilter:

'''Filter Messages from keywords
Use Back Sorted Mapping to reduce replacement times
>>> f = BSFilter()
>>> f.add("sexy")
>>> f.filter("hello sexy baby")
hello **** baby
'''

def __init__(self):
self.keywords = []
self.kwsets = set([])
self.bsdict = defaultdict(set)
self.pat_en = re.compile(r'^[0-9a-zA-Z]+$') # english phrase or not
def add(self,keyword):
if not isinstance(keyword,unicode):
self.pat_en = re.compile(r'^[0-9a-zA-Z]+$') # english phrase or not

def add(self, keyword):
if not isinstance(keyword, unicode):
keyword = keyword.decode('utf-8')
keyword = keyword.lower()
if keyword not in self.kwsets:
Expand All @@ -63,41 +67,44 @@ def add(self,keyword):
for char in word:
self.bsdict[char].add(index)

def parse(self, path):
    '''Read the file at ``path`` and add each line as a keyword.

    Every line is stripped of surrounding whitespace and handed to
    ``self.add``; the file is closed when reading finishes.
    '''
    with open(path, "r") as source:
        for entry in (line.strip() for line in source):
            self.add(entry)

def filter(self, message, repl="*"):
    '''Return lower-cased ``message`` with matching keywords replaced.

    ``message`` may be text or a UTF-8 encoded byte string.  The message
    is split on whitespace.  A word made only of ASCII letters/digits is
    looked up in ``self.bsdict`` as a whole; any other word is looked up
    character by character.  Every keyword index found this way is
    replaced in the message by a single ``repl``.
    '''
    if isinstance(message, bytes):
        message = message.decode('utf-8')
    message = message.lower()
    # Use .get() rather than indexing: self.bsdict is a defaultdict, and
    # indexing it would insert an empty set for every unseen word or
    # character, growing the index on each call.
    lookup = self.bsdict.get
    for word in message.split():
        # One key for an alphanumeric word, otherwise one key per character.
        keys = (word,) if self.pat_en.search(word) else word
        for key in keys:
            for index in lookup(key, ()):
                message = message.replace(self.keywords[index], repl)
    return message


class DFAFilter():

'''Filter Messages from keywords
Use DFA to keep algorithm perform constantly
>>> f = DFAFilter()
>>> f.add("sexy")
>>> f.filter("hello sexy baby")
hello **** baby
'''

def __init__(self):
self.keyword_chains = {}
self.delimit = '\x00'
def add(self,keyword):
if not isinstance(keyword,unicode):

def add(self, keyword):
if not isinstance(keyword, unicode):
keyword = keyword.decode('utf-8')
keyword = keyword.lower()
chars = keyword.strip()
Expand All @@ -106,27 +113,26 @@ def add(self,keyword):
level = self.keyword_chains
for i in range(len(chars)):
if chars[i] in level:
level = level[ chars[i] ]
level = level[chars[i]]
else:
if not isinstance(level,dict):
if not isinstance(level, dict):
break
for j in range(i,len(chars)):
level[ chars[j] ] = {}
last_level,last_char = level,chars[j]
level = level[ chars[j] ]
last_level[last_char] = {self.delimit:0}
for j in range(i, len(chars)):
level[chars[j]] = {}
last_level, last_char = level, chars[j]
level = level[chars[j]]
last_level[last_char] = {self.delimit: 0}
break
if i == len(chars)-1:
if i == len(chars) - 1:
level[self.delimit] = 0


def parse(self, path):
    '''Read the file at ``path`` and add each line as a keyword.

    Every line is stripped of surrounding whitespace and handed to
    ``self.add``; the file is closed when reading finishes.
    '''
    with open(path) as source:
        for entry in (line.strip() for line in source):
            self.add(entry)

def filter(self,message,repl="*"):
if not isinstance(message,unicode):
def filter(self, message, repl="*"):
if not isinstance(message, unicode):
message = message.decode('utf-8')
message = message.lower()
ret = []
Expand All @@ -137,29 +143,39 @@ def filter(self,message,repl="*"):
for char in message[start:]:
if char in level:
step_ins += 1
if self.delimit not in level[ char ]:
level = level[ char ]
if self.delimit not in level[char]:
level = level[char]
else:
ret.append( repl*step_ins )
ret.append(repl * step_ins)
start += step_ins - 1
break
else:
ret.append( message[start] )
ret.append(message[start])
break
else:
ret.append(message[start])
start += 1

return ''.join(ret)


def test_first_character():
    '''Check that a message which is only a prefix of a keyword is kept.

    With the single keyword "1989年" registered, filtering "1989" must
    return the message unchanged.
    '''
    keyword_filter = DFAFilter()
    keyword_filter.add("1989年")
    filtered = keyword_filter.filter("1989", "*")
    assert filtered == "1989"


if __name__ == "__main__":
    # Alternative implementations that can be swapped in:
    # gfw = NaiveFilter()
    # gfw = BSFilter()
    gfw = DFAFilter()
    gfw.parse("keywords")
    import time
    samples = (
        "法轮功 我操操操",
        "针孔摄像机 我操操操",
        "售假人民币 我操操操",
        "传世私服 我操操操",
    )
    # Time the filtering of the sample messages, then print the elapsed
    # seconds.
    started = time.time()
    for sample in samples:
        print(gfw.filter(sample, "*"))
    print(time.time() - started)

    test_first_character()

0 comments on commit 0b9269e

Please sign in to comment.