update code, version 2.9.3
xianhu committed Dec 19, 2017
1 parent 10184ec commit 2e02fe4
Showing 11 changed files with 28 additions and 58 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
# PSpider

A simple spider frame written by Python, which needs Python3.5+
A simple web spider frame written by Python, which needs Python3.5+

### Features of PSpider
1. Support multi-threading crawling mode (using threading and requests)
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@

setup(
name="spider",
version="2.9.2",
version="2.9.3",
author="xianhu",
keywords=["spider", "crawler", "multi-threads", "distributed", "proxies"],
packages=find_packages(exclude=("test.*",)),
2 changes: 1 addition & 1 deletion spider/__init__.py
@@ -4,7 +4,7 @@
define WebSpider, WebSpiderDist, and also define utilities and instances for web_spider
"""

__version__ = "2.9.2"
__version__ = "2.9.3"

from .utilities import *
from .instances import Fetcher, Parser, Saver, Proxieser
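For context (not part of this commit), a minimal sketch of how the package's exported pieces might be wired together. The WebSpider name comes from the module docstring above; the constructor arguments here are assumptions modeled on the ThreadPool signature shown further down in this diff.

import spider

fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)   # parameters documented in inst_fetch.py below
parser = spider.Parser()                                # assumed default constructor
saver = spider.Saver()                                  # assumed default constructor

# WebSpider is named in the docstring of spider/__init__.py; the keyword
# arguments are assumptions based on ThreadPool.__init__ in threads_pool.py.
web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=None)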
20 changes: 9 additions & 11 deletions spider/concurrent/threads_inst/threads_inst_base.py
@@ -19,19 +19,18 @@ class TPEnum(enum.Enum):
TASKS_RUNNING = "tasks_running" # flag of tasks_running

URL_FETCH = "url_fetch" # flag of url_fetch
HTM_PARSE = "htm_parse" # flag of htm_parse
ITEM_SAVE = "item_save" # flag of item_save

URL_NOT_FETCH = "url_not_fetch" # flag of url_not_fetch
HTM_NOT_PARSE = "htm_not_parse" # flag of htm_not_parse
ITEM_NOT_SAVE = "item_not_save" # flag of item_not_save

URL_FETCH_SUCC = "url_fetch_succ" # flag of url_fetch_succ
HTM_PARSE_SUCC = "htm_parse_succ" # flag of htm_parse_succ
ITEM_SAVE_SUCC = "item_save_succ" # flag of item_save_succ

URL_FETCH_FAIL = "url_fetch_fail" # flag of url_fetch_fail

HTM_PARSE = "htm_parse" # flag of htm_parse
HTM_NOT_PARSE = "htm_not_parse" # flag of htm_not_parse
HTM_PARSE_SUCC = "htm_parse_succ" # flag of htm_parse_succ
HTM_PARSE_FAIL = "htm_parse_fail" # flag of htm_parse_fail

ITEM_SAVE = "item_save" # flag of item_save
ITEM_NOT_SAVE = "item_not_save" # flag of item_not_save
ITEM_SAVE_SUCC = "item_save_succ" # flag of item_save_succ
ITEM_SAVE_FAIL = "item_save_fail" # flag of item_save_fail

PROXIES = "proxies" # flag of proxies
@@ -130,8 +129,7 @@ def work_monitor(self):
self._last_save_num = cur_save_all

if self._pool.get_proxies_flag():
info += " proxies:[LEFT=%d, FAIL=%d];" % \
(self._pool.get_number_dict(TPEnum.PROXIES_LEFT), self._pool.get_number_dict(TPEnum.PROXIES_FAIL))
info += " proxies:[LEFT=%d, FAIL=%d];" % (self._pool.get_number_dict(TPEnum.PROXIES_LEFT), self._pool.get_number_dict(TPEnum.PROXIES_FAIL))

info += " total_seconds=%d" % (time.time() - self._init_time)
logging.warning(info)
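As a quick illustration of the reflowed logging statement, a standalone sketch (with made-up counts standing in for self._pool.get_number_dict(...) and the elapsed time) of what the monitor line ends up looking like:

# Stand-in values; in work_monitor() these come from the pool's number_dict and self._init_time.
proxies_left, proxies_fail, elapsed = 10, 2, 42

info = ""
info += " proxies:[LEFT=%d, FAIL=%d];" % (proxies_left, proxies_fail)
info += " total_seconds=%d" % elapsed
print(info.strip())   # -> proxies:[LEFT=10, FAIL=2]; total_seconds=42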
2 changes: 1 addition & 1 deletion spider/concurrent/threads_inst/threads_inst_fetch.py
@@ -43,7 +43,7 @@ def working(self):
else:
self._pool.update_number_dict(TPEnum.URL_FETCH_FAIL, +1)

if not proxies_state:
if (not proxies_state) and self._proxies:
self._pool.update_number_dict(TPEnum.PROXIES_FAIL, +1)
self._pool.finish_a_task(TPEnum.PROXIES)
self._proxies = None
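The added "and self._proxies" guard means a failed fetch is only counted against a proxy when one was actually assigned to the thread. A small self-contained sketch of that branch (the helper name and the plain counter are illustrative, not the real ThreadPool bookkeeping):

def handle_fetch_failure(proxies, proxies_state):
    """Hypothetical helper mirroring the guarded branch above."""
    proxies_fail = 0
    if (not proxies_state) and proxies:    # only blame the proxy if one was in use
        proxies_fail += 1                  # corresponds to update_number_dict(TPEnum.PROXIES_FAIL, +1)
        proxies = None                     # drop the bad proxy, as the worker does with self._proxies
    return proxies, proxies_fail

# With no proxy assigned, a failed fetch no longer inflates the proxy-failure count:
print(handle_fetch_failure(None, False))                              # -> (None, 0)
print(handle_fetch_failure({"http": "http://1.2.3.4:8080"}, False))   # -> (None, 1)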
12 changes: 6 additions & 6 deletions spider/concurrent/threads_pool.py
@@ -37,15 +37,15 @@ def __init__(self, fetcher, parser, saver, proxieser=None, url_filter=None, moni
TPEnum.TASKS_RUNNING: 0, # the count of tasks which are running

TPEnum.URL_NOT_FETCH: 0, # the count of urls which haven't been fetched
TPEnum.HTM_NOT_PARSE: 0, # the count of urls which haven't been parsed
TPEnum.ITEM_NOT_SAVE: 0, # the count of urls which haven't been saved

TPEnum.URL_FETCH_SUCC: 0, # the count of urls which have been fetched successfully
TPEnum.HTM_PARSE_SUCC: 0, # the count of urls which have been parsed successfully
TPEnum.ITEM_SAVE_SUCC: 0, # the count of urls which have been saved successfully

TPEnum.URL_FETCH_FAIL: 0, # the count of urls which have been fetched failed

TPEnum.HTM_NOT_PARSE: 0, # the count of urls which haven't been parsed
TPEnum.HTM_PARSE_SUCC: 0, # the count of urls which have been parsed successfully
TPEnum.HTM_PARSE_FAIL: 0, # the count of urls which have been parsed failed

TPEnum.ITEM_NOT_SAVE: 0, # the count of urls which haven't been saved
TPEnum.ITEM_SAVE_SUCC: 0, # the count of urls which have been saved successfully
TPEnum.ITEM_SAVE_FAIL: 0, # the count of urls which have been saved failed

TPEnum.PROXIES_LEFT: 0, # the count of proxies which are avaliable
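These counters are shared by all worker threads, so updates to number_dict presumably need to be synchronized. update_number_dict itself is not shown in this diff; the following is only a minimal thread-safe sketch of that idea, with assumed names, string keys standing in for TPEnum members, and an assumed single-lock strategy.

import threading

class CounterPoolSketch:
    """Hypothetical stand-in for the counter bookkeeping in ThreadPool."""

    def __init__(self):
        self._lock = threading.Lock()
        self._number_dict = {"url_fetch_succ": 0, "url_fetch_fail": 0}   # abbreviated

    def update_number_dict(self, key, value):
        with self._lock:              # one lock guards every counter update
            self._number_dict[key] += value

    def get_number_dict(self, key):
        with self._lock:
            return self._number_dict[key]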
4 changes: 2 additions & 2 deletions spider/instances/inst_fetch.py
@@ -19,7 +19,7 @@ class of Fetcher, must include function working()
def __init__(self, max_repeat=3, sleep_time=0):
"""
constructor
:param max_repeat: default 3, maximum repeat fetching time for a url
:param max_repeat: default 3, maximum repeat count of fetching for a url
:param sleep_time: default 0, sleeping time after a fetching for a url
"""
self._max_repeat = max_repeat
@@ -31,7 +31,7 @@ def working(self, priority: int, url: str, keys: dict, deep: int, repeat: int, p
working function, must "try, except" and don't change the parameters and return
:return (fetch_result, proxies_state, content): fetch_result can be -2(fetch failed, stop thread), -1(fetch failed), 0(need repeat), 1(fetch success)
:return (fetch_result, proxies_state, content): proxies_state can be True(avaiable), False(unavaiable)
:return (fetch_result, proxies_state, content): content can be any object, for example: string, list, etc
:return (fetch_result, proxies_state, content): content can be any object, for example string, list, etc
"""
logging.debug("%s start: %s", self.__class__.__name__, CONFIG_FETCH_MESSAGE % (priority, keys, deep, repeat, url))

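A sketch of a Fetcher subclass that honors the documented (fetch_result, proxies_state, content) contract. The url_fetch hook name, the proxies parameter, and the use of requests are assumptions for illustration and are not taken from this diff.

import requests
from spider import Fetcher

class MyFetcher(Fetcher):
    def url_fetch(self, priority, url, keys, deep, repeat, proxies):
        # Any exception raised here is expected to be caught by working()'s try/except.
        response = requests.get(url, proxies=proxies, timeout=10)
        response.raise_for_status()
        # 1 = fetch success, True = proxies still usable, content is handed on to the Parser
        return 1, True, (response.status_code, response.url, response.text)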
2 changes: 1 addition & 1 deletion spider/instances/inst_proxies.py
@@ -26,7 +26,7 @@ def working(self) -> (int, list):
"""
working function, must "try, except" and don't change the parameters and return
:return (proxies_result, proxies_list): proxies_result can be -1(get failed), 1(get success)
:return (proxies_result, proxies_list): proxies list which getting from web or database
:return (proxies_result, proxies_list): proxies_list is a proxies list which getting from web or database
"""
logging.debug("%s start", self.__class__.__name__)

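Likewise, a sketch of a Proxieser subclass returning the documented (proxies_result, proxies_list) pair; the proxies_get hook name and the hard-coded proxy entry are illustrative assumptions only.

from spider import Proxieser

class MyProxieser(Proxieser):
    def proxies_get(self):
        # In practice this would pull proxies from a provider API or a database.
        proxies_list = [{"http": "http://1.2.3.4:8080", "https": "https://1.2.3.4:8080"}]
        return 1, proxies_list     # 1 = get success, -1 = get failed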
2 changes: 1 addition & 1 deletion spider/utilities/util_fetch.py
@@ -20,7 +20,7 @@ def extract_error_info():
"""
_type, _value, _traceback = sys.exc_info()
tb_list = traceback.extract_tb(_traceback)
error_info = "-->".join(["filename=%s, line=%s, function=%s" % (tb.filename, tb.lineno, tb.name) for tb in tb_list])
error_info = "-->".join(["[filename=%s, line=%s, function=%s]" % (tb.filename, tb.lineno, tb.name) for tb in tb_list])
return "error_info=%s, error_type=%s, error=%s" % (error_info, _type, _value)


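extract_error_info() builds on sys.exc_info(), so it is meant to be called from inside an except block. A usage sketch with the bracketed per-frame format added here (the import path is the module shown above; the expected output shape is an approximation):

from spider.utilities.util_fetch import extract_error_info

try:
    1 / 0
except Exception:
    # roughly: error_info=[filename=..., line=..., function=...], error_type=<class 'ZeroDivisionError'>, error=division by zero
    print(extract_error_info())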
36 changes: 4 additions & 32 deletions spider/utilities/util_parse.py
@@ -15,41 +15,13 @@
]


def get_string_num(string, base=None, only_num=True):
def get_string_num(string):
"""
get a float number from a string, if base isn't None, K means (base * B), M means (base * K), ...
get a float number from a string
"""
string_temp = get_string_strip(string.upper().replace(",", ""), replace_char="")
string_re = re.search(r"(?P<num>\d+(\.\d+)?)(?P<param>.*?)$", string_temp, flags=re.IGNORECASE)
if not string_re:
return 0.0
num, param = float(string_re.group("num")), string_re.group("param")
if only_num:
return num
if param.find("兆") >= 0:
num *= 10000000000000
if param.find("亿") >= 0:
num *= 100000000
if param.find("万") >= 0:
num *= 10000
if param.find("千") >= 0:
num *= 1000
if param.find("百") >= 0:
num *= 100
if param.find("十") >= 0:
num *= 10
if param.find("%") >= 0:
num /= 100
if base:
if param.find("K") >= 0:
num *= base
if param.find("M") >= 0:
num *= base * base
if param.find("G") >= 0:
num *= base * base * base
if param.find("T") >= 0:
num *= base * base * base * base
return num
string_re = re.search(r"(?P<num>\d+(\.\d+)?)", string_temp, flags=re.IGNORECASE)
return float(string_re.group("num")) if string_re else 0.0


def get_string_strip(string, replace_char=" "):
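Usage sketch of the simplified helper: it now strips commas and whitespace and returns only the first number it finds, with no unit ("万", "K", "%") or base handling. The expected values follow directly from the regex above; the import path is the file shown in this diff.

from spider.utilities.util_parse import get_string_num

print(get_string_num("1,234.5 views"))    # -> 1234.5
print(get_string_num("about 3.8万"))      # -> 3.8 (the 万 multiplier is no longer applied)
print(get_string_num("no digits here"))   # -> 0.0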
2 changes: 1 addition & 1 deletion spider/utilities/util_urlfilter.py
@@ -16,7 +16,7 @@ class of UrlFilter, to filter url by regexs and (bloomfilter or set)

def __init__(self, black_patterns=(CONFIG_URL_PATTERN,), white_patterns=(r"^http",), capacity=None):
"""
constructor, use variable of BloomFilter if capacity else variable of set
constructor, use instance of BloomFilter if capacity else instance of set
"""
self._re_black_list = [re.compile(pattern, flags=re.IGNORECASE) for pattern in black_patterns] if black_patterns else []
self._re_white_list = [re.compile(pattern, flags=re.IGNORECASE) for pattern in white_patterns] if white_patterns else []
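Construction sketch for the reworded docstring: with capacity given, the filter deduplicates URLs through a BloomFilter instance; without it, through an ordinary set. Parameter names match the constructor above; the example patterns and capacity are illustrative only.

from spider.utilities.util_urlfilter import UrlFilter

small_filter = UrlFilter(white_patterns=(r"^https?://",))                      # set-backed, exact membership
large_filter = UrlFilter(white_patterns=(r"^https?://",), capacity=10000000)   # BloomFilter-backed, probabilistic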
