Skip to content

Commit

Permalink
Live Demo and reset function added
Browse files Browse the repository at this point in the history
  • Loading branch information
Newt0n committed Mar 29, 2013
1 parent 280376d commit b579012
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 10 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ Progress
===
### Text Extractor
* 正文提取,参考 [基于行块分布函数的通用网页正文抽取算法](https://cx-extractor.googlecode.com/files/%E5%9F%BA%E4%BA%8E%E8%A1%8C%E5%9D%97%E5%88%86%E5%B8%83%E5%87%BD%E6%95%B0%E7%9A%84%E9%80%9A%E7%94%A8%E7%BD%91%E9%A1%B5%E6%AD%A3%E6%96%87%E6%8A%BD%E5%8F%96%E7%AE%97%E6%B3%95.pdf) 实现 Python 版本做粗略文本提取。
* Example: ```python extractor.py http://hb.qq.com/a/20130324/000235.htm```
* Example: ```python extractor.py http://hb.qq.com/a/20130324/000235.htm```
* Live Demo: [https://py-newt0n.rhcloud.com/](https://py-newt0n.rhcloud.com/)
24 changes: 15 additions & 9 deletions extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,7 @@

class Extractor(object):

rawPage = ''
text = ''
isGB = True
textLines = []
blocksLen = []
isCharsetGB = True

def __init__(self, url, blockSize=3):
def __init__(self, url='', blockSize=3):
self.url = url
self.blockSize = blockSize

Expand All @@ -41,6 +34,14 @@ def __init__(self, url, blockSize=3):
# Reduce redundancy
self.reRedun = re.compile('\n{%s,}' % (self.blockSize+1))

def reset(self):
    """Restore the per-page extraction state to its initial values.

    Clears the raw page and extracted text, empties the line and
    block-length lists, and sets both GB charset flags back to True.
    """
    # Build two separate list objects; the attributes must never alias.
    self.textLines, self.blocksLen = [], []
    self.rawPage = self.text = ''
    self.isGB = self.isCharsetGB = True

def getRawPage(self):
    """Download the document at ``self.url`` into ``self.rawPage``.

    The body is stored exactly as read from the response; no decoding
    is done here. Any error raised by ``urllib2.urlopen`` or ``read``
    propagates to the caller.
    """
    response = urllib2.urlopen(self.url)
    try:
        self.rawPage = response.read()
    finally:
        # Always release the underlying connection, even if read() fails.
        response.close()

Expand Down Expand Up @@ -89,10 +90,15 @@ def calcBlockLens(self):
self.blocksLen.append(blockLen)

# Merge the most likely blocks into the final plain text
def getPlainText(self):
def getPlainText(self, url=''):
self.reset()
if url:
self.url = url
self.getRawPage()
self.handleEncoding()
preProcDoc = self.preProcess(self.rawPage)
# f = open('dump')
# preProcDoc = f.read()
self.getTextLines(preProcDoc)
self.calcBlockLens()

Expand Down

0 comments on commit b579012

Please sign in to comment.