Make Parser and Saver nullable, version 1.4.0
xianhu committed Jun 27, 2018
1 parent 6b1ea03 commit ef242f4
Showing 6 changed files with 29 additions and 21 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name="spider",
-    version="1.3.0",
+    version="1.4.0",
     author="xianhu",
     keywords=["spider", "crawler", "multi-threads", "distributed", "proxies"],
     packages=find_packages(exclude=("test.*",)),

2 changes: 1 addition & 1 deletion spider/__init__.py
@@ -4,7 +4,7 @@
 define WebSpider, WebSpiderDist, and also define utilities and instances for web_spider
 """
 
-__version__ = "1.3.0"
+__version__ = "1.4.0"
 
 from .utilities import *
 from .instances import Fetcher, Parser, Saver, Proxieser

3 changes: 2 additions & 1 deletion spider/concurrent/threads_inst/threads_inst_fetch.py
@@ -39,7 +39,8 @@ def working(self):
         # ----3----
         if fetch_result > 0:
             self._pool.update_number_dict(TPEnum.URL_FETCH_SUCC, +1)
-            self._pool.add_a_task(TPEnum.HTM_PARSE, (priority, counter, url, keys, deep, content))
+            if content is not None:
+                self._pool.add_a_task(TPEnum.HTM_PARSE, (priority, counter, url, keys, deep, content))
         elif fetch_result == 0:
             self._pool.add_a_task(TPEnum.URL_FETCH, (priority+1, counter, url, keys, deep, repeat+1))
         else:

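The new guard lets a fetcher report success without queuing anything for the parser. A minimal sketch of that use case, assuming requests is available and that the truncated final parameter of Fetcher.working() (see spider/instances/inst_fetch.py below) is a proxies dict; HeadOnlyFetcher and its link-checking purpose are illustrative, not part of the commit:

import requests  # assumption: requests is installed; any HTTP client works here

from spider import Fetcher


class HeadOnlyFetcher(Fetcher):
    """
    Hypothetical link-checking fetcher: it verifies a URL but returns content=None,
    so the fetch thread above never calls add_a_task(TPEnum.HTM_PARSE, ...) and the
    pool can run without any Parser at all.
    """

    def working(self, priority: int, url: str, keys: dict, deep: int, repeat: int, proxies=None):
        try:
            resp = requests.head(url, proxies=proxies, timeout=10, allow_redirects=True)
            if resp.status_code >= 400:
                return -1, True, None    # fetch failed, nothing to parse
            return 1, True, None         # fetch succeeded, still nothing to parse
        except requests.exceptions.RequestException:
            return (0, True, None) if repeat < 3 else (-1, True, None)  # retry a few times, then give up
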
39 changes: 23 additions & 16 deletions spider/concurrent/threads_pool.py
@@ -17,13 +17,13 @@ class ThreadPool(object):
     class of ThreadPool
     """
 
-    def __init__(self, fetcher, parser, saver, proxieser=None, url_filter=None, monitor_sleep_time=5):
+    def __init__(self, fetcher, parser=None, saver=None, proxieser=None, url_filter=None, monitor_sleep_time=5):
         """
         constructor
         """
         self._inst_fetcher = fetcher                # fetcher instance, subclass of Fetcher
-        self._inst_parser = parser                  # parser instance, subclass of Parser
-        self._inst_saver = saver                    # saver instance, subclass of Saver
+        self._inst_parser = parser                  # parser instance, subclass of Parser or None
+        self._inst_saver = saver                    # saver instance, subclass of Saver or None
         self._inst_proxieser = proxieser            # default: None, proxieser instance, subclass of Proxieser
 
         self._queue_fetch = queue.PriorityQueue()   # (priority, counter, url, keys, deep, repeat)
@@ -33,7 +33,8 @@ def __init__(self, fetcher, parser, saver, proxieser=None, url_filter=None, monitor_sleep_time=5):
 
         self._thread_proxieser = None               # proxieser thread
         self._thread_fetcher_list = []              # fetcher threads list
-        self._thread_parsar_list = []               # parser and saver threads list
+        self._thread_parser = None                  # parser thread
+        self._thread_saver = None                   # saver thread
 
         self._thread_stop_flag = False              # default: False, stop flag of threads
         self._url_filter = url_filter               # default: None, also can be UrlFilter()
@@ -82,19 +83,24 @@ def start_working(self, fetcher_num=10):
 
         self._thread_proxieser = ProxiesThread("proxieser", self._inst_proxieser, self) if self._inst_proxieser else None
         self._thread_fetcher_list = [FetchThread("fetcher-%d" % (i+1), copy.deepcopy(self._inst_fetcher), self) for i in range(fetcher_num)]
-        self._thread_parsar_list = [ParseThread("parser", self._inst_parser, self), SaveThread("saver", self._inst_saver, self)]
+        self._thread_parser = ParseThread("parser", self._inst_parser, self) if self._inst_parser else None
+        self._thread_saver = SaveThread("saver", self._inst_saver, self) if self._inst_saver else None
 
-        if self.get_proxies_flag():
+        if self._thread_proxieser:
             self._thread_proxieser.setDaemon(True)
             self._thread_proxieser.start()
 
         for thread in self._thread_fetcher_list:
             thread.setDaemon(True)
             thread.start()
 
-        for thread in self._thread_parsar_list:
-            thread.setDaemon(True)
-            thread.start()
+        if self._thread_parser:
+            self._thread_parser.setDaemon(True)
+            self._thread_parser.start()
+
+        if self._thread_saver:
+            self._thread_saver.setDaemon(True)
+            self._thread_saver.start()
 
         logging.info("%s start success", self.__class__.__name__)
         return
@@ -110,15 +116,16 @@ def wait_for_finished(self):
             if thread.is_alive():
                 thread.join()
 
-        for thread in self._thread_parsar_list:
-            if thread.is_alive():
-                thread.join()
+        if self._thread_parser and self._thread_parser.is_alive():
+            self._thread_parser.join()
+
+        if self._thread_saver and self._thread_saver.is_alive():
+            self._thread_saver.join()
 
-        if self.get_proxies_flag():
-            if self._thread_proxieser.is_alive():
-                self._thread_proxieser.join()
+        if self._thread_proxieser and self._thread_proxieser.is_alive():
+            self._thread_proxieser.join()
 
-        if self._monitor.is_alive():
+        if self._monitor and self._monitor.is_alive():
             self._monitor.join()
 
         logging.info("%s finished: %s", self.__class__.__name__, self._number_dict)

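Because the parser and saver threads are now created conditionally, a fetch-only pool no longer needs stub Parser or Saver instances. A usage sketch under two assumptions: WebSpider is the exported alias of this ThreadPool (as the spider/__init__.py docstring suggests) and set_start_url is the seeding call; HeadOnlyFetcher is the hypothetical fetcher sketched earlier:

from spider import WebSpider  # assumption: exported alias of ThreadPool, per spider/__init__.py

# parser and saver default to None, so no ParseThread or SaveThread is created
pool = WebSpider(fetcher=HeadOnlyFetcher(), parser=None, saver=None)

pool.set_start_url("http://example.com/", keys={})  # hypothetical seeding call
pool.start_working(fetcher_num=10)  # starts only the threads that actually exist
pool.wait_for_finished()            # joins parser/saver threads only if they were created
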
2 changes: 1 addition & 1 deletion spider/concurrent/threads_pool_dist.py
@@ -15,7 +15,7 @@ class DistThreadPool(ThreadPool):
     class of DistThreadPool, as the subclass of ThreadPool
     """
 
-    def __init__(self, fetcher, parser, saver, proxieser=None, url_filter=None, monitor_sleep_time=5):
+    def __init__(self, fetcher, parser=None, saver=None, proxieser=None, url_filter=None, monitor_sleep_time=5):
         """
         constructor
         """

2 changes: 1 addition & 1 deletion spider/instances/inst_fetch.py
@@ -31,7 +31,7 @@ def working(self, priority: int, url: str, keys: dict, deep: int, repeat: int, p
         working function, must "try, except" and don't change the parameters and return
         :return fetch_result: can be -1(fetch failed), 0(need repeat), 1(fetch success)
         :return proxies_state: can be False(unavaiable), True(avaiable)
-        :return content: can be any object, for example string, list, etc
+        :return content: can be any object, for example string, list, None, etc
         """
         logging.debug("%s start: %s", self.__class__.__name__, CONFIG_FETCH_MESSAGE % (priority, keys, deep, repeat, url))
 
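The three-valued fetch_result contract is easiest to see in a concrete body. A hedged sketch of a working() override, again assuming requests and a proxies dict as the truncated final parameter; the status-code heuristics are illustrative only:

import requests  # assumption: requests is installed

from spider import Fetcher


class SimpleFetcher(Fetcher):

    def working(self, priority: int, url: str, keys: dict, deep: int, repeat: int, proxies=None):
        # must not raise: every outcome maps to (fetch_result, proxies_state, content)
        try:
            resp = requests.get(url, proxies=proxies, timeout=(3.05, 10))
            if resp.status_code in (403, 407):
                return 0, False, None   # likely a dead proxy: repeat url, flag proxies unavailable
            resp.raise_for_status()
            return 1, True, resp.text   # success: content is queued for parsing
        except requests.exceptions.Timeout:
            return 0, True, None        # transient: ask the pool to repeat this url
        except requests.exceptions.RequestException:
            return -1, True, None       # hard failure: drop this url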
