update code, version 2.9.3
xianhu committed Dec 19, 2017
1 parent 10184ec commit 2e02fe4
Showing 11 changed files with 28 additions and 58 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
# PSpider

A simple spider frame written by Python, which needs Python3.5+
A simple web spider frame written by Python, which needs Python3.5+

### Features of PSpider
1. Support multi-threading crawling mode (using threading and requests)
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@

setup(
name="spider",
version="2.9.2",
version="2.9.3",
author="xianhu",
keywords=["spider", "crawler", "multi-threads", "distributed", "proxies"],
packages=find_packages(exclude=("test.*",)),
2 changes: 1 addition & 1 deletion spider/__init__.py
@@ -4,7 +4,7 @@
define WebSpider, WebSpiderDist, and also define utilities and instances for web_spider
"""

__version__ = "2.9.2"
__version__ = "2.9.3"

from .utilities import *
from .instances import Fetcher, Parser, Saver, Proxieser
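For context (not part of this commit), a minimal sketch of how the package's exported pieces might be wired together. The WebSpider name comes from the module docstring above; the constructor arguments here are assumptions modeled on the ThreadPool signature shown further down in this diff.

import spider

fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)   # parameters documented in inst_fetch.py below
parser = spider.Parser()                                # assumed default constructor
saver = spider.Saver()                                  # assumed default constructor

# WebSpider is named in the docstring of spider/__init__.py; the keyword
# arguments are assumptions based on ThreadPool.__init__ in threads_pool.py.
web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=None)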
20 changes: 9 additions & 11 deletions spider/concurrent/threads_inst/threads_inst_base.py
@@ -19,19 +19,18 @@ class TPEnum(enum.Enum):
TASKS_RUNNING = "tasks_running" # flag of tasks_running

URL_FETCH = "url_fetch" # flag of url_fetch
HTM_PARSE = "htm_parse" # flag of htm_parse
ITEM_SAVE = "item_save" # flag of item_save

URL_NOT_FETCH = "url_not_fetch" # flag of url_not_fetch
HTM_NOT_PARSE = "htm_not_parse" # flag of htm_not_parse
ITEM_NOT_SAVE = "item_not_save" # flag of item_not_save

URL_FETCH_SUCC = "url_fetch_succ" # flag of url_fetch_succ
HTM_PARSE_SUCC = "htm_parse_succ" # flag of htm_parse_succ
ITEM_SAVE_SUCC = "item_save_succ" # flag of item_save_succ

URL_FETCH_FAIL = "url_fetch_fail" # flag of url_fetch_fail

HTM_PARSE = "htm_parse" # flag of htm_parse
HTM_NOT_PARSE = "htm_not_parse" # flag of htm_not_parse
HTM_PARSE_SUCC = "htm_parse_succ" # flag of htm_parse_succ
HTM_PARSE_FAIL = "htm_parse_fail" # flag of htm_parse_fail

ITEM_SAVE = "item_save" # flag of item_save
ITEM_NOT_SAVE = "item_not_save" # flag of item_not_save
ITEM_SAVE_SUCC = "item_save_succ" # flag of item_save_succ
ITEM_SAVE_FAIL = "item_save_fail" # flag of item_save_fail

PROXIES = "proxies" # flag of proxies
@@ -130,8 +129,7 @@ def work_monitor(self):
self._last_save_num = cur_save_all

if self._pool.get_proxies_flag():
info += " proxies:[LEFT=%d, FAIL=%d];" % \
(self._pool.get_number_dict(TPEnum.PROXIES_LEFT), self._pool.get_number_dict(TPEnum.PROXIES_FAIL))
info += " proxies:[LEFT=%d, FAIL=%d];" % (self._pool.get_number_dict(TPEnum.PROXIES_LEFT), self._pool.get_number_dict(TPEnum.PROXIES_FAIL))

info += " total_seconds=%d" % (time.time() - self._init_time)
logging.warning(info)
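As a quick illustration of the reflowed logging statement, a standalone sketch (with made-up counts standing in for self._pool.get_number_dict(...) and the elapsed time) of what the monitor line ends up looking like:

# Stand-in values; in work_monitor() these come from the pool's number_dict and self._init_time.
proxies_left, proxies_fail, elapsed = 10, 2, 42

info = ""
info += " proxies:[LEFT=%d, FAIL=%d];" % (proxies_left, proxies_fail)
info += " total_seconds=%d" % elapsed
print(info.strip())   # -> proxies:[LEFT=10, FAIL=2]; total_seconds=42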
2 changes: 1 addition & 1 deletion spider/concurrent/threads_inst/threads_inst_fetch.py
@@ -43,7 +43,7 @@ def working(self):
else:
self._pool.update_number_dict(TPEnum.URL_FETCH_FAIL, +1)

if not proxies_state:
if (not proxies_state) and self._proxies:
self._pool.update_number_dict(TPEnum.PROXIES_FAIL, +1)
self._pool.finish_a_task(TPEnum.PROXIES)
self._proxies = None
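The added "and self._proxies" guard means a failed fetch is only counted against a proxy when one was actually assigned to the thread. A small self-contained sketch of that branch (the helper name and the plain counter are illustrative, not the real ThreadPool bookkeeping):

def handle_fetch_failure(proxies, proxies_state):
    """Hypothetical helper mirroring the guarded branch above."""
    proxies_fail = 0
    if (not proxies_state) and proxies:    # only blame the proxy if one was in use
        proxies_fail += 1                  # corresponds to update_number_dict(TPEnum.PROXIES_FAIL, +1)
        proxies = None                     # drop the bad proxy, as the worker does with self._proxies
    return proxies, proxies_fail

# With no proxy assigned, a failed fetch no longer inflates the proxy-failure count:
print(handle_fetch_failure(None, False))                              # -> (None, 0)
print(handle_fetch_failure({"http": "http://1.2.3.4:8080"}, False))   # -> (None, 1)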
12 changes: 6 additions & 6 deletions spider/concurrent/threads_pool.py
@@ -37,15 +37,15 @@ def __init__(self, fetcher, parser, saver, proxieser=None, url_filter=None, moni
TPEnum.TASKS_RUNNING: 0, # the count of tasks which are running

TPEnum.URL_NOT_FETCH: 0, # the count of urls which haven't been fetched
TPEnum.HTM_NOT_PARSE: 0, # the count of urls which haven't been parsed
TPEnum.ITEM_NOT_SAVE: 0, # the count of urls which haven't been saved

TPEnum.URL_FETCH_SUCC: 0, # the count of urls which have been fetched successfully
TPEnum.HTM_PARSE_SUCC: 0, # the count of urls which have been parsed successfully
TPEnum.ITEM_SAVE_SUCC: 0, # the count of urls which have been saved successfully

TPEnum.URL_FETCH_FAIL: 0, # the count of urls which have been fetched failed

TPEnum.HTM_NOT_PARSE: 0, # the count of urls which haven't been parsed
TPEnum.HTM_PARSE_SUCC: 0, # the count of urls which have been parsed successfully
TPEnum.HTM_PARSE_FAIL: 0, # the count of urls which have been parsed failed

TPEnum.ITEM_NOT_SAVE: 0, # the count of urls which haven't been saved
TPEnum.ITEM_SAVE_SUCC: 0, # the count of urls which have been saved successfully
TPEnum.ITEM_SAVE_FAIL: 0, # the count of urls which have been saved failed

TPEnum.PROXIES_LEFT: 0, # the count of proxies which are avaliable
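These counters are shared by all worker threads, so updates to number_dict presumably need to be synchronized. update_number_dict itself is not shown in this diff; the following is only a minimal thread-safe sketch of that idea, with assumed names, string keys standing in for TPEnum members, and an assumed single-lock strategy.

import threading

class CounterPoolSketch:
    """Hypothetical stand-in for the counter bookkeeping in ThreadPool."""

    def __init__(self):
        self._lock = threading.Lock()
        self._number_dict = {"url_fetch_succ": 0, "url_fetch_fail": 0}   # abbreviated

    def update_number_dict(self, key, value):
        with self._lock:              # one lock guards every counter update
            self._number_dict[key] += value

    def get_number_dict(self, key):
        with self._lock:
            return self._number_dict[key]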
4 changes: 2 additions & 2 deletions spider/instances/inst_fetch.py
@@ -19,7 +19,7 @@ class of Fetcher, must include function working()
def __init__(self, max_repeat=3, sleep_time=0):
"""
constructor
:param max_repeat: default 3, maximum repeat fetching time for a url
:param max_repeat: default 3, maximum repeat count of fetching for a url
:param sleep_time: default 0, sleeping time after a fetching for a url
"""
self._max_repeat = max_repeat
@@ -31,7 +31,7 @@ def working(self, priority: int, url: str, keys: dict, deep: int, repeat: int, p
working function, must "try, except" and don't change the parameters and return
:return (fetch_result, proxies_state, content): fetch_result can be -2(fetch failed, stop thread), -1(fetch failed), 0(need repeat), 1(fetch success)
:return (fetch_result, proxies_state, content): proxies_state can be True(avaiable), False(unavaiable)
:return (fetch_result, proxies_state, content): content can be any object, for example: string, list, etc
:return (fetch_result, proxies_state, content): content can be any object, for example string, list, etc
"""
logging.debug("%s start: %s", self.__class__.__name__, CONFIG_FETCH_MESSAGE % (priority, keys, deep, repeat, url))

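A sketch of a Fetcher subclass that honors the documented (fetch_result, proxies_state, content) contract. The url_fetch hook name, the proxies parameter, and the use of requests are assumptions for illustration and are not taken from this diff.

import requests
from spider import Fetcher

class MyFetcher(Fetcher):
    def url_fetch(self, priority, url, keys, deep, repeat, proxies):
        # Any exception raised here is expected to be caught by working()'s try/except.
        response = requests.get(url, proxies=proxies, timeout=10)
        response.raise_for_status()
        # 1 = fetch success, True = proxies still usable, content is handed on to the Parser
        return 1, True, (response.status_code, response.url, response.text)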
2 changes: 1 addition & 1 deletion spider/instances/inst_proxies.py
@@ -26,7 +26,7 @@ def working(self) -> (int, list):
"""
working function, must "try, except" and don't change the parameters and return
:return (proxies_result, proxies_list): proxies_result can be -1(get failed), 1(get success)
:return (proxies_result, proxies_list): proxies list which getting from web or database
:return (proxies_result, proxies_list): proxies_list is a proxies list which getting from web or database
"""
logging.debug("%s start", self.__class__.__name__)

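Likewise, a sketch of a Proxieser subclass returning the documented (proxies_result, proxies_list) pair; the proxies_get hook name and the hard-coded proxy entry are illustrative assumptions only.

from spider import Proxieser

class MyProxieser(Proxieser):
    def proxies_get(self):
        # In practice this would pull proxies from a provider API or a database.
        proxies_list = [{"http": "http://1.2.3.4:8080", "https": "https://1.2.3.4:8080"}]
        return 1, proxies_list     # 1 = get success, -1 = get failed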
2 changes: 1 addition & 1 deletion spider/utilities/util_fetch.py
@@ -20,7 +20,7 @@ def extract_error_info():
"""
_type, _value, _traceback = sys.exc_info()
tb_list = traceback.extract_tb(_traceback)
error_info = "-->".join(["filename=%s, line=%s, function=%s" % (tb.filename, tb.lineno, tb.name) for tb in tb_list])
error_info = "-->".join(["[filename=%s, line=%s, function=%s]" % (tb.filename, tb.lineno, tb.name) for tb in tb_list])
return "error_info=%s, error_type=%s, error=%s" % (error_info, _type, _value)


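extract_error_info() builds on sys.exc_info(), so it is meant to be called from inside an except block. A usage sketch with the bracketed per-frame format added here (the import path is the module shown above; the expected output shape is an approximation):

from spider.utilities.util_fetch import extract_error_info

try:
    1 / 0
except Exception:
    # roughly: error_info=[filename=..., line=..., function=...], error_type=<class 'ZeroDivisionError'>, error=division by zero
    print(extract_error_info())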
36 changes: 4 additions & 32 deletions spider/utilities/util_parse.py
@@ -15,41 +15,13 @@
]


def get_string_num(string, base=None, only_num=True):
def get_string_num(string):
"""
get a float number from a string, if base isn't None, K means (base * B), M means (base * K), ...
get a float number from a string
"""
string_temp = get_string_strip(string.upper().replace(",", ""), replace_char="")
string_re = re.search(r"(?P<num>\d+(\.\d+)?)(?P<param>.*?)$", string_temp, flags=re.IGNORECASE)
if not string_re:
return 0.0
num, param = float(string_re.group("num")), string_re.group("param")
if only_num:
return num
if param.find("兆") >= 0:
num *= 10000000000000
if param.find("亿") >= 0:
num *= 100000000
if param.find("万") >= 0:
num *= 10000
if param.find("千") >= 0:
num *= 1000
if param.find("百") >= 0:
num *= 100
if param.find("十") >= 0:
num *= 10
if param.find("%") >= 0:
num /= 100
if base:
if param.find("K") >= 0:
num *= base
if param.find("M") >= 0:
num *= base * base
if param.find("G") >= 0:
num *= base * base * base
if param.find("T") >= 0:
num *= base * base * base * base
return num
string_re = re.search(r"(?P<num>\d+(\.\d+)?)", string_temp, flags=re.IGNORECASE)
return float(string_re.group("num")) if string_re else 0.0


def get_string_strip(string, replace_char=" "):
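Usage sketch of the simplified helper: it now strips commas and whitespace and returns only the first number it finds, with no unit ("万", "K", "%") or base handling. The expected values follow directly from the regex above; the import path is the file shown in this diff.

from spider.utilities.util_parse import get_string_num

print(get_string_num("1,234.5 views"))    # -> 1234.5
print(get_string_num("about 3.8万"))      # -> 3.8 (the 万 multiplier is no longer applied)
print(get_string_num("no digits here"))   # -> 0.0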
2 changes: 1 addition & 1 deletion spider/utilities/util_urlfilter.py
@@ -16,7 +16,7 @@ class of UrlFilter, to filter url by regexs and (bloomfilter or set)

def __init__(self, black_patterns=(CONFIG_URL_PATTERN,), white_patterns=(r"^http",), capacity=None):
"""
constructor, use variable of BloomFilter if capacity else variable of set
constructor, use instance of BloomFilter if capacity else instance of set
"""
self._re_black_list = [re.compile(pattern, flags=re.IGNORECASE) for pattern in black_patterns] if black_patterns else []
self._re_white_list = [re.compile(pattern, flags=re.IGNORECASE) for pattern in white_patterns] if white_patterns else []
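Construction sketch for the reworded docstring: with capacity given, the filter deduplicates URLs through a BloomFilter instance; without it, through an ordinary set. Parameter names match the constructor above; the example patterns and capacity are illustrative only.

from spider.utilities.util_urlfilter import UrlFilter

small_filter = UrlFilter(white_patterns=(r"^https?://",))                      # set-backed, exact membership
large_filter = UrlFilter(white_patterns=(r"^https?://",), capacity=10000000)   # BloomFilter-backed, probabilistic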
