# -*- coding: utf-8 -*-
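"""Download the WxPythonInAction book from wiki.woodpecker.org.cn.

Every chapter page linked under /moin/WxPythonInAction/Chapter* is saved
as an HTML file, then the images attached to each chapter are fetched
into an images/ subfolder. Python 2 only (sgmllib, urllib2, raw_input).
"""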
from sgmllib import SGMLParser
from urllib import urlretrieve
import urllib2
import re
import threading
import time
import os
class URLLister(SGMLParser):
    """Collect hrefs of chapter pages whose URL matches self.match."""
    def __init__(self):
        SGMLParser.__init__(self)
        self.match = ''
        self.dicurl = {}  # href -> True while the page is still to be fetched

    def start_a(self, attrs):
        for k, v in attrs:
            if k == 'href' and re.match(self.match, v):
                self.dicurl[v] = True
class IMGLister(SGMLParser):
    """Collect img srcs that match the attachment pattern in self.search."""
    def __init__(self):
        SGMLParser.__init__(self)
        self.search = ''
        self.dicimg = {}  # src -> True while the image is still to be fetched

    def start_img(self, attrs):
        for k, v in attrs:
            if k == 'src' and re.search(self.search, v):
                self.dicimg[v] = True
# one shared lock guarding the url/image dictionaries across all threads
g_mutex = threading.Lock()

def spider(baseurl, dicurl, filepath):
    """Claim one not-yet-fetched chapter URL and save its HTML locally."""
    url = ''
    g_mutex.acquire()
    for k in dicurl:
        if dicurl[k]:
            dicurl[k] = False  # mark claimed so no other thread fetches it
            url = baseurl + k
            break
    g_mutex.release()
    if url != '':
        content = urllib2.urlopen(url).read()
        filename = url.split('/')[-1]
        filepath += filename + '.html'
        try:
            fw = open(filepath, 'wb')
            fw.write(content)
            print filepath + ' written successfully'
            fw.close()
        except IOError, e:
            print e
        time.sleep(1)
def spiderimg(baseurl, dicimg, filepath):
    """Claim one not-yet-fetched image URL and download it."""
    url = ''
    g_mutex.acquire()
    for k in dicimg:
        if dicimg[k]:
            dicimg[k] = False  # mark claimed so no other thread fetches it
            url = baseurl + k
            break
    g_mutex.release()
    if url != '':
        downfile(url, filepath)
        time.sleep(1)
def downfile(netpath, localpath):
    """Pull the attachment filename out of the URL and download the file."""
    filenamerule = re.compile(r'(?<=\btarget\b=)(.*\..*)$')
    filenameres = filenamerule.search(netpath)
    filename = filenameres.group(0)
    try:
        urlretrieve(netpath, localpath + filename)
        print localpath + filename + ' saved successfully'
    except IOError, e:
        print e
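# Illustrative URL of the shape downfile() expects (the Chapter1/fig1.png
# names are hypothetical):
#   http://wiki.woodpecker.org.cn/moin/WxPythonInAction/Chapter1?action=AttachFile&do=get&target=fig1.png
# The lookbehind regex extracts everything after 'target=', here 'fig1.png'.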
# begin
url_base = 'http://wiki.woodpecker.org.cn'
print 'Opening page... ' + url_base + '/moin/WxPythonInAction'
content = urllib2.urlopen(url_base + '/moin/WxPythonInAction').read()
print 'Scanning for chapter hrefs...'
lister = URLLister()
lister.match = '/moin/WxPythonInAction/Chapter'
lister.feed(content)
dicurl = lister.dicurl
threadpool = []
print 'Save directory (such as d:\\docs)'
filepath = raw_input()
# make sure the save path ends with a path separator
if not filepath.endswith(os.sep):
    filepath += os.sep
print filepath
try:
    os.makedirs(filepath)
    print 'Directory did not exist; created it'
except OSError:
    print 'Directory already exists; continuing'
for k in dicurl:
    th = threading.Thread(target=spider, args=(url_base, dicurl, filepath))
    threadpool.append(th)
for th in threadpool:
    th.start()
for th in threadpool:
    th.join()
print 'Chapter download finished; starting image download'
folder = 'images' + os.sep
if not os.path.exists(filepath + folder):
    os.makedirs(filepath + folder)
for k in dicurl:
    url = url_base + k
    content = urllib2.urlopen(url).read()
    imglister = IMGLister()
    imglister.search = r'/moin/WxPythonInAction/\bChapter\w+\b\?action=AttachFile\&do=get\&target=(.*\..*)$'
    imglister.feed(content)
    dicimg = imglister.dicimg
    # alternative layout kept from the original: one image folder per chapter
    # folderrule = re.compile(r'\bChapter\w+\b')
    # for val in dicimg:
    #     folderres = re.search(folderrule, val)
    #     folder = folderres.group(0) + '\\'
    #     break
    # if not os.path.exists(filepath + folder):
    #     os.makedirs(filepath + folder)
    threadpool2 = []
    for val in dicimg:
        th = threading.Thread(target=spiderimg, args=(url_base, dicimg, filepath + folder))
        threadpool2.append(th)
    for th in threadpool2:
        th.start()
    for th in threadpool2:
        th.join()
    print k  # progress: this chapter's images are done
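# Expected layout after a run (chapter names are illustrative):
#   <save dir>/Chapter1.html, <save dir>/Chapter2.html, ...
#   <save dir>/images/<attachment files from every chapter>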