-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcsdnBot.py
124 lines (92 loc) · 3.25 KB
/
csdnBot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#encoding=utf-8
import codecs
import random
import re
import time

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request exposes Request/urlopen under one module,
    # so aliasing it keeps the rest of the file unchanged.
    import urllib.request as urllib2
class CSDN(object):
    """Bot that scrapes the article list of one CSDN blog and then
    repeatedly requests individual posts, choosing each post with a
    probability weighted by its current read count.

    NOTE(review): this deliberately inflates page-view counters on a
    third-party site; confirm this use is acceptable before running it.
    """
    # %d is substituted with the 1-based page number of the article list.
    basic_url = 'http://blog.csdn.net/wds2006sdo/article/list/%d'
    # Bounds on how many page visits a single robot_start() run performs.
    time_min = 30
    time_max = 40

    def __init__(self):
        # self.blogs entries become [url, read_count, cumulative_weight].
        self.blogs = self.get_pages()
        self.add_eachblog_random_range()
        # Upper bound for the weighted random draw in select_page().
        self.random_range = self.blogs[-1][2]
        for blog in self.blogs:
            print('%s  %s  %s' % (blog[0], blog[1], blog[2]))

    def access_by_url(self, url):
        """Fetch *url* and return the response body as text, or None on any error."""
        try:
            timeout = 5
            request = urllib2.Request(url)
            # Disguise the request as a regular desktop browser.
            request.add_header('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')
            request.add_header('connection', 'keep-alive')
            request.add_header('referer', url)
            response = urllib2.urlopen(request, timeout=timeout)
            html = response.read()
            response.close()
            # Python 3 returns bytes; decode so the regexes below see text.
            # (On Python 2, bytes IS str, so this branch is a no-op.)
            if isinstance(html, bytes) and not isinstance(html, str):
                html = html.decode('utf-8', 'replace')
            return html
        except Exception as e:
            # Best-effort fetch: report and signal failure to the caller.
            print('URL Request Error: %s' % e)
            return None

    def get_pages(self):
        """Walk the article-list pages and collect every blog entry found."""
        blog_list = []
        page = 1
        lastpage = 2  # provisional; replaced by the real value on page 1
        while page <= lastpage:
            url = self.basic_url % page
            html = self.access_by_url(url)
            if html is None:
                # Fetch failed: stop with whatever we have instead of
                # crashing inside extract_blogs.
                break
            blog_list.extend(self.extract_blogs(html))
            try:
                # BUG FIX: the original pattern captured (\d) — a single
                # digit — so lists longer than 9 pages were truncated.
                lastpage = int(re.findall(r'<a href=.*?(\d+)">尾页', html)[0])
            except (IndexError, ValueError):
                # No "last page" link on this page: assume we are done.
                break
            page += 1
        return blog_list

    def extract_blogs(self, html):
        """Parse one list page; return a list of [url, read_count] pairs."""
        blogs = []
        blog_strings = re.findall(r'<span class="link_view" title="阅读次数">.*</span>', html)
        for blog_string in blog_strings:
            article_id = re.findall(r'wds2006sdo/article/details/(\d+)', blog_string)[0]
            url = 'http://blog.csdn.net/wds2006sdo/article/details/' + article_id
            read_count = re.findall(r'\((\d+)\)', blog_string)[0]
            blogs.append([url, int(read_count)])
        return blogs

    def write_file(self, content):
        """Dump *content* to index.html (debug helper; not called elsewhere here)."""
        # BUG FIX: codecs.open without an encoding gives no transcoding at
        # all; be explicit so unicode content round-trips.
        file_object = codecs.open("index.html", 'w', encoding='utf-8')
        try:
            file_object.write(str(content))
        finally:
            file_object.close()

    def add_eachblog_random_range(self):
        """Append a cumulative weight to every entry of self.blogs.

        Afterwards blog[2] is the upper edge of that blog's interval on a
        number line; select_page() draws a random integer and returns the
        blog whose interval contains it.  Well-read posts get wider
        intervals, and earlier posts get an extra bonus (order_num) that
        decays by repeated integer square roots.
        """
        # Renamed from `sum`, which shadowed the builtin.
        total = 0
        for blog in self.blogs:
            total += blog[1]
        mean = int(float(total) / float(len(self.blogs)))
        order_num = mean
        end = 0
        for blog in self.blogs:
            blog.append(end + mean + blog[1] + order_num)
            end += mean + blog[1] + order_num
            order_num = int(order_num ** 0.5)

    def select_page(self):
        """Pick one blog URL by weighted random choice over the intervals."""
        num = random.randint(1, self.random_range)
        print('num  %s' % num)
        i = 0
        while i < len(self.blogs) and num > self.blogs[i][2]:
            i += 1
        return self.blogs[i][0]

    def sleep(self):
        """Pause 1-60 s at random so the traffic looks less mechanical."""
        sleep_time = random.randint(1, 60)
        print(' sleep  %s  second' % sleep_time)
        time.sleep(sleep_time)

    def robot_start(self):
        """Visit a random number of weighted-random posts, sleeping between hits."""
        robot_size = random.randint(self.time_min, self.time_max)
        print(robot_size)
        for _ in range(robot_size):
            url = self.select_page()
            print(url)
            self.access_by_url(url)
            self.sleep()
if __name__ == '__main__':
    # Build the bot (the constructor scrapes the article list), then
    # start the visiting loop.
    bot = CSDN()
    bot.robot_start()