-
Notifications
You must be signed in to change notification settings - Fork 4
/
browser.py
133 lines (119 loc) · 5.22 KB
/
browser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/python
#
# Peteris Krumins ([email protected])
# http://www.catonmat.net -- good coders code, great reuse
#
# http://www.catonmat.net/blog/python-library-for-google-search/
#
# Code is licensed under MIT license.
#
import random
import socket
import urllib
import urllib2
import httplib
import os
TIMEOUT = 15 # socket timeout
BROWSERS = (
# Top most popular browsers in my access.log on 2009.02.12
# tail -50000 access.log |
# awk -F\" '{B[$6]++} END { for (b in B) { print B[b] ": " b } }' |
# sort -rn |
# head -20
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.6) Gecko/2009011912 Firefox/3.0.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.48 Safari/525.19',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.5) Gecko/2008121621 Ubuntu/8.04 (hardy) Firefox/3.0.5',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
)
class BrowserError(Exception):
def __init__(self, url, error):
self.url = url
self.error = error
class PoolHTTPConnection(httplib.HTTPConnection):
def connect(self):
"""Connect to the host and port specified in __init__."""
msg = "getaddrinfo returns an empty list"
for res in socket.getaddrinfo(self.host, self.port, 0,
socket.SOCK_STREAM):
af, socktype, proto, canonname, sa = res
try:
self.sock = socket.socket(af, socktype, proto)
if self.debuglevel > 0:
print "connect: (%s, %s)" % (self.host, self.port)
self.sock.settimeout(TIMEOUT)
self.sock.connect(sa)
except socket.error, msg:
if self.debuglevel > 0:
print 'connect fail:', (self.host, self.port)
if self.sock:
self.sock.close()
self.sock = None
continue
break
if not self.sock:
raise socket.error, msg
class PoolHTTPHandler(urllib2.HTTPHandler):
def http_open(self, req):
return self.do_open(PoolHTTPConnection, req)
class Browser(object):
def __init__(self, user_agent=BROWSERS[0], debug=False, use_pool=False):
self.headers = {
'User-Agent': user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-us,en;q=0.5'
}
self.debug = debug
def get_page(self, url, data=None, http_proxy=None):
'''http_proxy(ip:port)'''
handlers = [PoolHTTPHandler]
if http_proxy:handlers.append(urllib2.ProxyHandler({'http': http_proxy}))
opener = urllib2.build_opener(*handlers)
if data: data = urllib.urlencode(data)
request = urllib2.Request(url, data, self.headers)
try:
response = opener.open(request)
return response.read()
except (urllib2.HTTPError, urllib2.URLError), e:
raise BrowserError(url, str(e))
except (socket.error, socket.sslerror), msg:
raise BrowserError(url, msg)
except socket.timeout, e:
raise BrowserError(url, "timeout")
except KeyboardInterrupt:
raise
except:
raise BrowserError(url, "unknown error")
def set_random_user_agent(self):
self.headers['User-Agent'] = random.choice(BROWSERS) #
return self.headers['User-Agent']
########custom logic#####################################
b = None
def download(url,proxy=None):
global b
if not b:
b = Browser()
b.set_random_user_agent()
return b.get_page(url,None,proxy)
def downad_and_save(url,lfile,proxy=None):
if os.path.exists(lfile):
return lfile
content = download(url,proxy)
with open(lfile,'w') as f:
f.write(content)
f.close()
if __name__ == "__main__":
#return 'http://www.baidu.com/s?%s' % urllib.urlencode({'wd': kword})
with open(r'/tmp/test.html','w+') as f:
html = download('http://www.soso.com/q?pid=s.idx&cid=s.idx.se&w=%B1%B1%BE%A9+%C8%D5%D7%E2%B7%BF')
f.write(html)
f.close()