Skip to content

Commit

Permalink
python3.6 编译通过,修复macos下面打开html乱码问题。
Browse files Browse the repository at this point in the history
  • Loading branch information
Your Name committed May 3, 2018
1 parent 1f92ccb commit ef978d5
Show file tree
Hide file tree
Showing 16 changed files with 5,567 additions and 22,695 deletions.
7 changes: 6 additions & 1 deletion ch07/ControlNode/DataOutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ def __init__(self):
self.filepath='baike_%s.html'%(time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) )
self.output_head(self.filepath)
self.datas=[]


def store_data(self,data):
if data is None:
return
Expand All @@ -21,10 +23,12 @@ def output_head(self,path):
'''
fout=codecs.open(path,'w',encoding='utf-8')
fout.write("<html>")
fout.write(r'''<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />''')
fout.write("<body>")
fout.write("<table>")
fout.close()


def output_html(self,path):
'''
将数据写入HTML文件中
Expand All @@ -37,10 +41,11 @@ def output_html(self,path):
fout.write("<td>%s</td>"%data['url'])
fout.write("<td>%s</td>"%data['title'])
fout.write("<td>%s</td>"%data['summary'])
fout.write("</tr>")
fout.write("</tr>")
self.datas=[]
fout.close()


def ouput_end(self,path):
'''
输出HTML结束
Expand Down
28 changes: 14 additions & 14 deletions ch07/ControlNode/NodeManager.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#coding:utf-8
#coding:utf-8

from multiprocessing.managers import BaseManager

import time

from multiprocessing import Process, Queue

from DataOutput import DataOutput
from UrlManager import UrlManager
from .DataOutput import DataOutput
from .UrlManager import UrlManager


class NodeManager(object):
Expand All @@ -24,12 +24,11 @@ def start_Manager(self,url_q,result_q):
BaseManager.register('get_task_queue',callable=lambda:url_q)
BaseManager.register('get_result_queue',callable=lambda:result_q)
#绑定端口8001,设置验证口令‘baike’。这个相当于对象的初始化
manager=BaseManager(address=('',8001),authkey='baike')
manager = BaseManager(address=('', 8001), authkey='baike'.encode('utf-8'))
#返回manager对象
return manager



def url_manager_proc(self,url_q,conn_q,root_url):
url_manager = UrlManager()
url_manager.add_new_url(root_url)
Expand All @@ -40,22 +39,21 @@ def url_manager_proc(self,url_q,conn_q,root_url):
new_url = url_manager.get_new_url()
#将新的URL发给工作节点
url_q.put(new_url)
print 'old_url=',url_manager.old_url_size()
print('old_url=',url_manager.old_url_size())
#加一个判断条件,当爬取2000个链接后就关闭,并保存进度
if(url_manager.old_url_size()>2000):
#通知爬行节点工作结束
url_q.put('end')
print '控制节点发起结束通知!'
print('控制节点发起结束通知!')
#关闭管理节点,同时存储set状态
url_manager.save_progress('new_urls.txt',url_manager.new_urls)
url_manager.save_progress('old_urls.txt',url_manager.old_urls)
return
#将从result_solve_proc获取到的urls添加到URL管理器中
try:
if not conn_q.empty():
urls = conn_q.get()
url_manager.add_new_urls(urls)
except BaseException,e:
urls = conn_q.get()
url_manager.add_new_urls(urls)
except BaseException as e:
time.sleep(0.1)#延时休息


Expand All @@ -64,26 +62,28 @@ def result_solve_proc(self,result_q,conn_q,store_q):
while(True):
try:
if not result_q.empty():
#Queue.get(block=True, timeout=None)
content = result_q.get(True)
if content['new_urls']=='end':
#结果分析进程接受通知然后结束
print '结果分析进程接受通知然后结束!'
print('结果分析进程接受通知然后结束!')
store_q.put('end')
return
conn_q.put(content['new_urls'])#url为set类型
store_q.put(content['data'])#解析出来的数据为dict类型
else:
time.sleep(0.1)#延时休息
except BaseException,e:
except BaseException as e:
time.sleep(0.1)#延时休息


def store_proc(self,store_q):
output = DataOutput()
while True:
if not store_q.empty():
data = store_q.get()
if data=='end':
print '存储进程接受通知然后结束!'
print('存储进程接受通知然后结束!')
output.ouput_end(output.filepath)

return
Expand Down
14 changes: 7 additions & 7 deletions ch07/ControlNode/UrlManager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#coding:utf-8
import cPickle
import pickle
import hashlib
class UrlManager(object):
def __init__(self):
Expand All @@ -19,7 +19,7 @@ def get_new_url(self):
'''
new_url = self.new_urls.pop()
m = hashlib.md5()
m.update(new_url)
m.update(new_url.encode("utf-8"))
self.old_urls.add(m.hexdigest()[8:-8])
return new_url

Expand All @@ -32,7 +32,7 @@ def add_new_url(self,url):
if url is None:
return
m = hashlib.md5()
m.update(url)
m.update(url.encode('utf-8'))
url_md5 = m.hexdigest()[8:-8]
if url not in self.new_urls and url_md5 not in self.old_urls:
self.new_urls.add(url)
Expand Down Expand Up @@ -70,19 +70,19 @@ def save_progress(self,path,data):
:return:
'''
with open(path, 'wb') as f:
cPickle.dump(data, f)
pickle.dump(data, f)

def load_progress(self,path):
'''
从本地文件加载进度
:param path:文件路径
:return:返回set集合
'''
print '[+] 从文件加载进度: %s' % path
print('[+] 从文件加载进度: %s' % path)
try:
with open(path, 'rb') as f:
tmp = cPickle.load(f)
tmp = pickle.load(f)
return tmp
except:
print '[!] 无进度文件, 创建: %s' % path
print('[!] 无进度文件, 创建: %s' % path)
return set()
Loading

0 comments on commit ef978d5

Please sign in to comment.