Commit 98b9d53: first commit
FlyLevin committed Feb 13, 2018
Showing 75 changed files with 4,595 additions and 0 deletions.
Empty file added IPPool/__init__.py
Empty file.
Binary file added IPPool/__pycache__/ip_source.cpython-35.pyc
Binary file not shown.
3 changes: 3 additions & 0 deletions IPPool/ip_source.py
@@ -0,0 +1,3 @@
IP_Source = {
    '66ip': 'http://www.66ip.cn/mo.php?sxb=%BD%AD%CB%D5&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea='
}
105 changes: 105 additions & 0 deletions IPPool/update_ip_pool.py
@@ -0,0 +1,105 @@
import sys
sys.path.append('/home/user/crawl')
from spider_tool import *
from ip_source import *
import requests
import time
from db_tool import *


TEST_URL = '://ip.cip.cc'   # no scheme here; it is prepended from `method` below
IPPool_DB = 'IPPool'

def test_proxies_efficience(proxy, method='http'):
    # if the proxy is unusable or takes more than 5 seconds, return a negative value to mark it as failed
    if method not in ['http', 'https']:
        g_log.error("method %s is incorrect." % method)
        return -1
    g_log.debug('Now test the efficiency of %s, with method %s' % (proxy, method))
    proxies = {method: proxy}
    start_time = time.time()
    try:
        response = requests.get(method + TEST_URL, proxies=proxies, timeout=5)
        cost = time.time() - start_time
        g_log.debug(response.text)
        g_log.debug('proxy %s costs %s' % (proxy, str(cost)))
        if cost < 5:
            return cost
        return -1
    except Exception as e:
        g_log.info('proxy unusable: %s' % e)
        return -1

def update_up_pool(cur):
    g_log.debug('first check the available proxies')
    cur.execute('select ip_addr, port, method, id from tb_ip_pool where is_active=True')
    count = cur.fetchall()
    g_log.debug('Now have %d active proxies' % len(count))
    update_proxies(cur, count)
    if count is None or len(count) < 30:
        get_new_proxies(cur)

def update_proxies(cur, proxies):
    g_log.debug('Now check availability of current proxies')
    for items in proxies:
        proxy = items[0] + ':' + items[1]
        method = items[2]
        ret = test_proxies_efficience(proxy, method)
        if ret < 0:
            g_log.warning(proxy + ' is not working, deactivate it!')
            cur.execute('update tb_ip_pool set is_active=False where id=%d' % items[3])
            cur.connection.commit()
    return

def get_new_proxies(cur):
    # open_url and BeautifulSoup are expected to be provided by the star imports above
    html = open_url(IP_Source['66ip'])
    if html is None:
        return
    bsObj = BeautifulSoup(html, 'html.parser')
    ip_ports = bsObj.findAll('br')

    ip_info = []
    for item in ip_ports:
        temp_info = item.next_sibling.strip()
        if len(temp_info) > 10 and temp_info not in ip_info:
            ip_info.append(temp_info)
    g_log.debug(len(ip_info))
    # get the existing ip/port pairs first to reduce the number of test connections
    cur.execute('select ip_addr, port from tb_ip_pool')
    existed = cur.fetchall()
    for proxy in ip_info:
        temp = proxy.split(':')
        if (temp[0], temp[1]) in existed:
            continue
        ret_http = test_proxies_efficience(proxy)
        if ret_http < 0:
            ret_https = test_proxies_efficience(proxy, 'https')
            if ret_https < 0:
                continue
            else:
                cur.execute('replace into tb_ip_pool (ip_addr, port, method, is_active, latency) values ("%s", "%s", "%s", True, "%f")' % (temp[0], temp[1], 'https', ret_https))
                cur.connection.commit()
        else:
            cur.execute('replace into tb_ip_pool (ip_addr, port, method, is_active, latency) values ("%s", "%s", "%s", True, "%f")' % (temp[0], temp[1], 'http', ret_http))
            cur.connection.commit()



def main():
    conn = None
    cur = None
    try:
        conn = create_new_connect(db=IPPool_DB)
        if conn is None:
            return
        cur = create_new_cursor(conn)
        if cur is None:
            return
        update_up_pool(cur)
    except Exception as e:
        g_log.error(e)
    finally:
        # guard against the early-return paths where conn or cur was never created
        if cur is not None:
            close_cursor(cur)
        if conn is not None:
            close_conn(conn)


if __name__ == "__main__":
    main()
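
update_ip_pool.py assumes a pre-existing tb_ip_pool table with ip_addr, port, method, is_active, and latency columns plus an id key, but the commit does not include the schema. Below is a minimal sketch of one schema that would satisfy the queries above; the column types and the unique key on (ip_addr, port), which "replace into" relies on, are assumptions rather than part of this repository.

import pymysql

# Hypothetical schema sketch; not part of this commit.
CREATE_TB_IP_POOL = '''
create table if not exists tb_ip_pool (
    id        int auto_increment primary key,
    ip_addr   varchar(45) not null,
    port      varchar(10) not null,
    method    varchar(8)  not null,
    is_active boolean     not null default true,
    latency   float       default null,
    unique key uq_ip_port (ip_addr, port)  -- "replace into" needs a unique key to replace on
)
'''

def create_ip_pool_table(conn):
    # Run once against the IPPool database before scheduling update_ip_pool.py.
    with conn.cursor() as cur:
        cur.execute(CREATE_TB_IP_POOL)
    conn.commit()
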
3 changes: 3 additions & 0 deletions README.md
@@ -0,0 +1,3 @@
# crawl_spider

Crawl Taobao pages with PhantomJS.
Empty file added __init__.py
Empty file.
Binary file added __pycache__/crawl.cpython-35.pyc
Binary file not shown.
Binary file added __pycache__/db_tool.cpython-35.pyc
Binary file not shown.
Binary file added __pycache__/log_ctl.cpython-35.pyc
Binary file not shown.
Binary file added __pycache__/spider_tool.cpython-35.pyc
Binary file not shown.
58 changes: 58 additions & 0 deletions crawl_0.1.py
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from urllib.request import urlopen
from bs4 import BeautifulSoup
from log_ctl import g_log

def open_url(URL):
    '''
    [Description]: open the requested url and return the raw html
    '''
    try:
        g_log.debug('Connecting to %s' % URL)
        html = urlopen(URL)
        return html
    except Exception as e:
        g_log.error(e)
        return None

def filter_rule(Dict):
    # keep only internal links that point at another wiki article
    return 'href' in Dict and Dict['href'].startswith('/wiki/')

def get_links(domain, URI):
    '''
    [Description]: open the URI under the given domain and follow every article link in it
    '''
    global pages
    html = open_url(domain + URI)
    if html is None:
        return
    bsObj = BeautifulSoup(html, 'html.parser')
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id="mw-content-text").findAll("p")[0])
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print('Some attribute is missing, continue')
    for link in bsObj.findAll(lambda tag: filter_rule(tag.attrs)):
        if link.attrs['href'] not in pages:
            newPage = link.attrs['href']
            print("--------------------------\n" + newPage)
            pages.add(newPage)
            get_links(domain, newPage)


pages = set()
Dom = 'http://en.wikipedia.org'

get_links(Dom, '')

'''
html = open_url(url)
if html is not None:
    bsObj = BeautifulSoup(html, 'html.parser')
    for link in bsObj.find("div", {"id": "bodyContent"}).findAll(lambda tag: filter_rule(tag.attrs)):
        if 'href' in link.attrs:
            print(link.attrs['href'])
'''
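
get_links recurses once for every new /wiki/ link it finds, so a long crawl will eventually hit Python's recursion limit. A bounded variant is sketched below; it reuses open_url, filter_rule, and the pages set from crawl_0.1.py above, and the max_depth parameter is an illustrative addition, not something this commit defines.

def get_links_bounded(domain, URI, depth=0, max_depth=2):
    # Same traversal as get_links above, but stops descending once max_depth is reached.
    if depth > max_depth:
        return
    html = open_url(domain + URI)
    if html is None:
        return
    bsObj = BeautifulSoup(html, 'html.parser')
    for link in bsObj.findAll(lambda tag: filter_rule(tag.attrs)):
        newPage = link.attrs['href']
        if newPage not in pages:
            pages.add(newPage)
            get_links_bounded(domain, newPage, depth + 1, max_depth)
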
28 changes: 28 additions & 0 deletions db_tool.py
@@ -0,0 +1,28 @@
from log_ctl import g_log
import pymysql

def create_new_connect(host='127.0.0.1', unix_socket='/var/run/mysqld/mysqld.sock', user='root', passwd='trend#11', db='mysql'):
    g_log.debug('now try to connect the db with %s, %s, %s, %s, %s' % (host, unix_socket, user, passwd, db))
    try:
        conn = pymysql.connect(host=host, unix_socket=unix_socket, user=user, passwd=passwd, db=db, use_unicode=True, charset="utf8")
        return conn
    except Exception as e:
        g_log.error(e)
        return None

def create_new_cursor(conn):
    g_log.debug('Now try to create a new cursor.')
    try:
        cur = conn.cursor()
        return cur
    except Exception as e:
        g_log.error(e)
        return None

def close_cursor(cursor):
    g_log.debug("now release the cursor")
    cursor.close()

def close_conn(conn):
    g_log.debug("now release the connection")
    conn.close()
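
A minimal usage sketch for these helpers, mirroring the pattern in update_ip_pool.py's main(); the database name and the query are placeholders, not values taken from the commit.

from db_tool import create_new_connect, create_new_cursor, close_cursor, close_conn

conn = create_new_connect(db='IPPool')        # other arguments fall back to the defaults above
if conn is not None:
    cur = create_new_cursor(conn)
    if cur is not None:
        try:
            cur.execute('select count(*) from tb_ip_pool')   # placeholder query
            print(cur.fetchone())
        finally:
            close_cursor(cur)
    close_conn(conn)
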
43 changes: 43 additions & 0 deletions log_ctl.py
@@ -0,0 +1,43 @@
#!/usr/bin/python

try:
    from logging.handlers import RotatingFileHandler
    import logging.config
    import os

except ImportError as e:
    raise ImportError('[%s] a module is missing.' % str(e))

# rotation defaults; these names were not defined in the original commit,
# so the values below are placeholders
LOG_MAX_SIZE = 10 * 1024 * 1024
BACKUP_NUM = 5
DEF_LOG_LEVEL = logging.INFO

logging.config.fileConfig("/home/user/crawl/logging.conf")
g_log = logging.getLogger()


# Init Log utility
def initLogger(log_file):
    ''' Init Log utility '''

    # make sure the log directory exists
    os.system('mkdir -p %s' % os.path.dirname(log_file))
    _Formatter = '%(asctime)s [%(process)d:%(thread)x] %(levelname)-8s[%(filename)s:%(lineno)d(%(funcName)s)] %(message)s'
    handler = RotatingFileHandler(filename=log_file, maxBytes=LOG_MAX_SIZE, backupCount=BACKUP_NUM)
    formatter = logging.Formatter(_Formatter)
    handler.setFormatter(formatter)
    logger = logging.getLogger()
    for hdlr in list(logger.handlers):   # iterate over a copy: handlers are removed in the loop
        hdlr.close()
        logger.removeHandler(hdlr)
    logger.setLevel(DEF_LOG_LEVEL)
    logger.addHandler(handler)

# set log level according to g_conf
def setLogLevel(str_loglevel):
    ''' set log level '''

    if str_loglevel == 'debug':
        g_log.setLevel(logging.DEBUG)
    elif str_loglevel == 'info':
        g_log.setLevel(logging.INFO)
    elif str_loglevel == 'warning':
        g_log.setLevel(logging.WARN)
    else:
        g_log.setLevel(logging.ERROR)
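
For reference, a short sketch of how the module-level g_log and setLogLevel are meant to be used by the other scripts in this commit; the log messages are illustrative only.

from log_ctl import g_log, setLogLevel

setLogLevel('debug')                      # maps to logging.DEBUG per the function above
g_log.debug('starting a crawl run')       # routed through the handlers in logging.conf
g_log.error('example error message')
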
35 changes: 35 additions & 0 deletions logging.conf
@@ -0,0 +1,35 @@
[loggers]
keys=root,simpleExample

[handlers]
keys=consoleHandler, fileHandler

[formatters]
keys=simpleFormatter

[logger_root]
level=INFO
handlers=fileHandler

[logger_simpleExample]
level=DEBUG
handlers=consoleHandler
qualname=simpleExample
propagate=0

[handler_consoleHandler]
class=StreamHandler
level=DEBUG
formatter=simpleFormatter
args=(sys.stdout,)

[handler_fileHandler]
class=FileHandler
level=DEBUG
formatter=simpleFormatter
args=('/home/user/crawl/spider.log','a')

[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
datefmt=
