Commit 98b9d53: 75 changed files with 4,595 additions and 0 deletions.
@@ -0,0 +1,3 @@
# Proxy list sources: the 66ip.cn bulk-extract URL returns a plain page of ip:port entries.
IP_Source = {
    '66ip': 'http://www.66ip.cn/mo.php?sxb=%BD%AD%CB%D5&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea='
}
@@ -0,0 +1,105 @@
import sys
sys.path.append('/home/user/crawl')
from spider_tool import *
from ip_source import *
import requests
import time
from db_tool import *


# 'http' or 'https' is prepended at request time, e.g. 'http' + TEST_URL -> 'http://ip.cip.cc'
TEST_URL = '://ip.cip.cc'
IPPool_DB = 'IPPool'


def test_proxies_efficience(proxy, method='http'):
    # If the proxy is unusable or times out (> 5 seconds), return a negative value to mark it as failed.
    if method not in ['http', 'https']:
        g_log.error("method %s incorrect." % method)
        return -1
    g_log.debug('Now test the efficiency of %s, with method %s' % (proxy, method))
    proxies = {method: proxy}
    start_time = time.time()
    try:
        response = requests.get(method + TEST_URL, proxies=proxies, timeout=5)
        cost = time.time() - start_time
        g_log.debug(response.text)
        g_log.debug('proxy %s costs %s' % (proxy, str(cost)))
        if cost < 5:
            return cost
        return -1
    except Exception as e:
        g_log.info('proxy unusable')
        return -1


def update_up_pool(cur):
    g_log.debug('First check the available proxies')
    cur.execute('select ip_addr, port, method, id from tb_ip_pool where is_active=True')
    count = cur.fetchall()
    g_log.debug('Now have %d active proxies' % len(count))
    update_proxies(cur, count)
    if count is None or len(count) < 30:
        get_new_proxies(cur)


def update_proxies(cur, proxies):
    g_log.debug('Now check availability of current proxies')
    for items in proxies:
        proxy = items[0] + ':' + items[1]
        method = items[2]
        ret = test_proxies_efficience(proxy, method)
        if ret < 0:
            g_log.warn(proxy + ' is not working, deactivate it!')
            cur.execute('update tb_ip_pool set is_active=False where id=%d' % items[3])
            cur.connection.commit()
    return


def get_new_proxies(cur):
    html = open_url(IP_Source['66ip'])
    if html is None:
        return
    bsObj = BeautifulSoup(html, 'html.parser')
    # The source page separates ip:port entries with <br> tags.
    ip_ports = bsObj.findAll('br')

    ip_info = []
    for item in ip_ports:
        temp_info = item.next_sibling.strip()
        if len(temp_info) > 10 and temp_info not in ip_info:
            ip_info.append(temp_info)
    g_log.debug(len(ip_info))
    # Get the existing ip/port pairs to reduce the number of test connections.
    cur.execute('select ip_addr, port from tb_ip_pool')
    existed = cur.fetchall()
    for proxy in ip_info:
        temp = proxy.split(':')
        if (temp[0], temp[1]) in existed:
            continue
        ret_http = test_proxies_efficience(proxy)
        if ret_http < 0:
            ret_https = test_proxies_efficience(proxy, 'https')
            if ret_https < 0:
                continue
            else:
                cur.execute('replace into tb_ip_pool (ip_addr, port, method, is_active, latency) values ("%s", "%s", "%s", True, "%f")' % (temp[0], temp[1], 'https', ret_https))
                cur.connection.commit()
        else:
            cur.execute('replace into tb_ip_pool (ip_addr, port, method, is_active, latency) values ("%s", "%s", "%s", True, "%f")' % (temp[0], temp[1], 'http', ret_http))
            cur.connection.commit()


def main():
    conn = None
    cur = None
    try:
        conn = create_new_connect(db=IPPool_DB)
        if conn is None:
            return
        cur = create_new_cursor(conn)
        if cur is None:
            return
        update_up_pool(cur)
    except Exception as e:
        g_log.error(e)
    finally:
        if cur is not None:
            close_cursor(cur)
        if conn is not None:
            close_conn(conn)


if __name__ == "__main__":
    main()
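The queries above assume an IPPool database with a tb_ip_pool table, but no schema ships in this commit. A minimal sketch of a layout consistent with the columns the script touches could look like the following; the column types, sizes, and the unique key are assumptions, not taken from the repository:

# Sketch only: a tb_ip_pool layout consistent with the queries above.
# Column types, sizes, and the unique key are assumptions; the real schema
# is not part of this commit.
from db_tool import create_new_connect, create_new_cursor, close_cursor, close_conn

conn = create_new_connect(db='IPPool')
if conn is not None:
    cur = create_new_cursor(conn)
    cur.execute('''
        create table if not exists tb_ip_pool (
            id        int auto_increment primary key,
            ip_addr   varchar(64) not null,
            port      varchar(16) not null,
            method    varchar(8)  not null,
            is_active boolean     not null default true,
            latency   float,
            unique key uk_ip_port (ip_addr, port)
        )
    ''')
    conn.commit()
    close_cursor(cur)
    close_conn(conn)

In MySQL, a unique key on (ip_addr, port) is what lets the replace into statements above overwrite an existing row for the same proxy instead of piling up duplicates.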
@@ -0,0 +1,3 @@
# crawl_spider

Crawls the Taobao page with PhantomJS.
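The PhantomJS-driven Taobao crawler itself sits in files not shown in this view. As a rough sketch of the approach the README describes, driving PhantomJS through an older Selenium release (one that still ships the PhantomJS webdriver) might look like this; the executable_path and the anchor-tag selector are illustrative assumptions, not code from this commit:

# Rough sketch, not code from this commit: render a JavaScript-heavy page
# with PhantomJS via an older Selenium release and read the resulting links.
# The executable_path and the selector are assumptions.
from selenium import webdriver

driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs')
try:
    driver.get('https://www.taobao.com')
    driver.implicitly_wait(10)              # let dynamically loaded content appear
    for link in driver.find_elements_by_css_selector('a'):
        href = link.get_attribute('href')
        if href:
            print(href)
finally:
    driver.quit()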
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from urllib.request import urlopen
from bs4 import BeautifulSoup
from log_ctl import g_log


def open_url(URL):
    '''
    [Description]: open the requested URL and return the raw HTML response
    '''
    try:
        g_log.debug('Connecting to %s' % URL)
        html = urlopen(URL)
        return html
    except Exception as e:
        g_log.error(e)
        return None


def filter_rule(Dict):
    # Keep only internal Wikipedia article links.
    return 'href' in Dict and Dict['href'].startswith('/wiki/')


def get_links(domain, URI):
    '''
    [Description]: open the URI under the given domain and recursively follow every link in it
    '''
    global pages
    html = open_url(domain + URI)
    if html is None:
        return
    bsObj = BeautifulSoup(html, 'html.parser')
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id="mw-content-text").findAll("p")[0])
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print('Some attribute lost, continue')
    # NOTE: the crawl recurses without a depth limit.
    for link in bsObj.findAll(lambda tag: filter_rule(tag.attrs)):
        if link.attrs['href'] not in pages:
            newPage = link.attrs['href']
            print("--------------------------\n" + newPage)
            pages.add(newPage)
            get_links(domain, newPage)


pages = set()
Dom = 'http://en.wikipedia.org'

if __name__ == '__main__':
    # Only start the recursive crawl when this file is run directly,
    # not when open_url is imported as a helper.
    get_links(Dom, '')

'''
html = open_url(url)
if html != None:
    bsObj = BeautifulSoup(html, 'html.parser')
    for link in bsObj.find("div", {"id": "bodyContent"}).findAll(lambda tag: filter_rule(tag.attrs)):
        if 'href' in link.attrs:
            print(link.attrs['href'])
'''
@@ -0,0 +1,28 @@
from log_ctl import g_log
import pymysql


def create_new_connect(host='127.0.0.1', unix_socket='/var/run/mysqld/mysqld.sock', user='root', passwd='trend#11', db='mysql'):
    g_log.debug('now try to connect the db with %s, %s, %s, %s, %s' % (host, unix_socket, user, passwd, db))
    try:
        conn = pymysql.connect(host=host, unix_socket=unix_socket, user=user, passwd=passwd, db=db, use_unicode=True, charset="utf8")
        return conn
    except Exception as e:
        g_log.error(e)
        return None


def create_new_cursor(conn):
    g_log.debug('Now try to create a new cursor.')
    try:
        cur = conn.cursor()
        return cur
    except Exception as e:
        g_log.error(e)
        return None


def close_cursor(cursor):
    g_log.debug("now release cursor")
    cursor.close()


def close_conn(conn):
    g_log.debug("now release connection")
    conn.close()
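For reference, a minimal round trip with these helpers against the IPPool database used by the pool script might look like this; it is illustrative only and assumes the default local MySQL settings and an existing tb_ip_pool table:

# Illustrative use of the helpers above; assumes the default local MySQL
# settings and an existing tb_ip_pool table.
from db_tool import create_new_connect, create_new_cursor, close_cursor, close_conn

conn = create_new_connect(db='IPPool')
if conn is not None:
    cur = create_new_cursor(conn)
    if cur is not None:
        cur.execute('select count(*) from tb_ip_pool where is_active=True')
        print('active proxies:', cur.fetchone()[0])
        close_cursor(cur)
    close_conn(conn)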
@@ -0,0 +1,43 @@
#!/usr/bin/python

try:
    from logging.handlers import RotatingFileHandler
    import logging.config
    import os

except ImportError as e:
    raise ImportError('[%s] a module is missing.' % str(e))


logging.config.fileConfig("/home/user/crawl/logging.conf")
g_log = logging.getLogger()


# Rotation and level defaults used by initLogger below; assumed values.
LOG_MAX_SIZE = 10 * 1024 * 1024
BACKUP_NUM = 5
DEF_LOG_LEVEL = logging.DEBUG


# Init Log utility
def initLogger(log_file):
    ''' Init Log utility '''

    # mkdir
    os.system('mkdir -p %s' % os.path.dirname(log_file))
    _Formatter = '%(asctime)s [%(process)d:%(thread)x] %(levelname)-8s[%(filename)s:%(lineno)d(%(funcName)s)] %(message)s'
    handler = RotatingFileHandler(filename=log_file, maxBytes=LOG_MAX_SIZE, backupCount=BACKUP_NUM)
    formatter = logging.Formatter(_Formatter)
    handler.setFormatter(formatter)
    logger = logging.getLogger()
    # Copy the handler list before removing entries from it.
    for hdlr in list(logger.handlers):
        hdlr.close()
        logger.removeHandler(hdlr)
    logger.setLevel(DEF_LOG_LEVEL)
    logger.addHandler(handler)


# set log level according to g_conf
def setLogLevel(str_loglevel):
    ''' set log level '''

    if str_loglevel == 'debug':
        g_log.setLevel(logging.DEBUG)
    elif str_loglevel == 'info':
        g_log.setLevel(logging.INFO)
    elif str_loglevel == 'warning':
        g_log.setLevel(logging.WARN)
    else:
        g_log.setLevel(logging.ERROR)
@@ -0,0 +1,35 @@
[loggers]
keys=root,simpleExample

[handlers]
keys=consoleHandler, fileHandler

[formatters]
keys=simpleFormatter

[logger_root]
level=INFO
handlers=fileHandler

[logger_simpleExample]
level=DEBUG
handlers=consoleHandler
qualname=simpleExample
propagate=0

[handler_consoleHandler]
class=StreamHandler
level=DEBUG
formatter=simpleFormatter
args=(sys.stdout,)

[handler_fileHandler]
class=FileHandler
level=DEBUG
formatter=simpleFormatter
args=('/home/user/crawl/spider.log','a')

[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
datefmt=