Commit
update code, change log level, version 1.0.1
xianhu committed Jan 10, 2018
1 parent 2015e62 commit fee74d7
Showing 7 changed files with 18 additions and 15 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@

 setup(
     name="spider",
-    version="1.0.0",
+    version="1.0.1",
     author="xianhu",
     keywords=["spider", "crawler", "multi-threads", "distributed", "proxies"],
     packages=find_packages(exclude=("test.*",)),
2 changes: 1 addition & 1 deletion spider/__init__.py
@@ -4,7 +4,7 @@
 define WebSpider, WebSpiderDist, and also define utilities and instances for web_spider
 """

-__version__ = "1.0.0"
+__version__ = "1.0.1"

 from .utilities import *
 from .instances import Fetcher, Parser, Saver, Proxieser
3 changes: 1 addition & 2 deletions spider/concurrent/threads_inst/threads_inst_base.py
@@ -85,7 +85,6 @@ def working(self):
 def init_monitor_thread(self, name, pool, sleep_time=5):
     """
     constructor of MonitorThread
-    :param sleep_time: sleeping time in every loop
     """
     BaseThread.__init__(self, name, None, pool)

@@ -130,7 +129,7 @@ def work_monitor(self):
         info += " proxies:[LEFT=%d, FAIL=%d];" % (self._pool.get_number_dict(TPEnum.PROXIES_LEFT), self._pool.get_number_dict(TPEnum.PROXIES_FAIL))

     info += " total_seconds=%d" % (time.time() - self._init_time)
-    logging.warning(info)
+    logging.info(info)
     return self._pool.get_monitor_flag()


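Note: this commit drops the periodic monitor line from WARNING to INFO, so it is only visible when the root logger is configured at INFO or lower; the basicConfig change in test.py further down makes the same adjustment on the consumer side. A minimal standard-library sketch of that level filtering, independent of this project's code:

import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s\t%(levelname)s\t%(message)s")
logging.info("visible: monitor-style status line")       # shown at level=INFO, hidden at level=WARNING
logging.warning("visible: shown at either level")        # WARNING and above always pass the INFO threshold

With level=WARNING the first line would be suppressed while the second still prints.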
5 changes: 3 additions & 2 deletions spider/concurrent/threads_inst/threads_inst_proxies.py
@@ -22,8 +22,9 @@ def working(self):
         proxies_result, proxies_list = self._worker.working()

         # ----3----
-        for proxies in proxies_list:
-            self._pool.add_a_task(TPEnum.PROXIES, proxies)
+        if proxies_result > 0:
+            for proxies in proxies_list:
+                self._pool.add_a_task(TPEnum.PROXIES, proxies)

         # ----5----
         while (self._pool.get_number_dict(TPEnum.PROXIES_LEFT) > 100) and (not self._pool.is_all_tasks_done()):
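Note: judging from the context line above, the proxies worker's working() returns a result code together with a list of proxies, and the added guard keeps failed or empty results out of the pool. A minimal sketch of that convention, with made-up names that are only illustrative and not the project's API:

def fetch_proxies():
    # hypothetical stand-in for a Proxieser worker; returns (result, proxies_list)
    try:
        proxies_list = ["1.2.3.4:8080", "5.6.7.8:3128"]  # pretend these came from a proxy source
        return len(proxies_list), proxies_list
    except Exception:
        return -1, []  # failure: non-positive result, nothing to queue

result, proxies = fetch_proxies()
if result > 0:  # only queue proxies when fetching them actually succeeded
    for proxy in proxies:
        print("queueing", proxy)  # stands in for pool.add_a_task(TPEnum.PROXIES, proxy)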
11 changes: 7 additions & 4 deletions spider/concurrent/threads_pool.py
@@ -73,14 +73,17 @@ def start_work_and_wait_done(self, fetcher_num=10, is_over=True):
"""
start this pool, and wait for finishing
"""
logging.warning("%s start: urls_count=%s, fetcher_num=%s, is_over=%s", self.__class__.__name__, self.get_number_dict(TPEnum.URL_FETCH_NOT), fetcher_num, is_over)
logging.info("%s start: urls_count=%s, fetcher_num=%s, is_over=%s", self.__class__.__name__, self.get_number_dict(TPEnum.URL_FETCH_NOT), fetcher_num, is_over)

# proxies thread
proxies_thread = ProxiesThread("proxieser", self._proxieser, self) if self._proxieser else None

# fetcher/parser/saver thread list
fetcher_list = [FetchThread("fetcher-%d" % (i+1), copy.deepcopy(self._inst_fetcher), self) for i in range(fetcher_num)]
parser_saver_list = [ParseThread("parser", self._inst_parser, self), SaveThread("saver", self._inst_saver, self)]
parser_saver_list = [
ParseThread("parser", self._inst_parser, self),
SaveThread("saver", self._inst_saver, self)
]

# ----1----
if proxies_thread:
@@ -104,7 +107,7 @@ def start_work_and_wait_done(self, fetcher_num=10, is_over=True):

         # clear the variables if all fetcher stoped
         while self.get_number_dict(TPEnum.URL_FETCH_NOT) > 0:
-            priority, counter, url, keys, deep, repeat = self.get_a_task(TPEnum.URL_FETCH)
+            priority, _, url, keys, deep, repeat = self.get_a_task(TPEnum.URL_FETCH)
             logging.error("%s error: not fetch, %s", self._inst_fetcher.__class__.__name__, CONFIG_FETCH_MESSAGE % (priority, keys, deep, repeat, url))
             self.update_number_dict(TPEnum.URL_FETCH_FAIL, +1)
             self.finish_a_task(TPEnum.URL_FETCH)
@@ -123,7 +126,7 @@ def start_work_and_wait_done(self, fetcher_num=10, is_over=True):
         self._monitor_flag = False
         self._monitor.join()

-        logging.warning("%s end: %s", self.__class__.__name__, self._number_dict)
+        logging.info("%s end: %s", self.__class__.__name__, self._number_dict)
         return self._number_dict

# ================================================================================================================================
2 changes: 1 addition & 1 deletion spider/concurrent/threads_pool_dist.py
@@ -46,7 +46,7 @@ def add_a_task(self, task_name, task_content):
         if task_name == TPEnum.PROXIES:
             self._proxies_queue.put_nowait(task_content)
             self.update_number_dict(TPEnum.PROXIES_LEFT, +1)
-        elif task_name == TPEnum.URL_FETCH and ((task_content[-1] > 0) or (not self._url_filter) or self._url_filter.check(task_content[1])):
+        elif task_name == TPEnum.URL_FETCH and ((task_content[-1] > 0) or (not self._url_filter) or self._url_filter.check_and_add(task_content[2])):
             self._redis_client.lpush(self._key_high_priority if task_content[0] < 100 else self._key_low_priority, task_content)
             self.update_number_dict(TPEnum.URL_FETCH_COUNT, +1)
         elif task_name == TPEnum.HTM_PARSE:
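Note: the filter fix lines up with the task layout unpacked in threads_pool.py earlier in this commit, (priority, counter, url, keys, deep, repeat): the old call handed index 1 (the counter) to the filter, while the url actually sits at index 2. A small sketch of that indexing, using illustrative values:

# URL_FETCH task layout, matching the unpacking in threads_pool.py in this commit:
#   (priority, counter, url, keys, deep, repeat)
task_content = (100, 7, "https://example.com", {}, 0, 1)
assert task_content[1] == 7                        # counter -- what the old code handed to the filter
assert task_content[2] == "https://example.com"    # url -- what the filter should actually check
assert task_content[-1] == 1                       # repeat > 0 lets a retried url bypass the filter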
8 changes: 4 additions & 4 deletions test.py
@@ -15,10 +15,10 @@ def test_spider():
"""
test spider
"""
# initial fetcher / parser / saver, you also can rewrite this three class
# initial fetcher / parser / saver, you also can rewrite this three classes
fetcher = spider.Fetcher(max_repeat=1, sleep_time=0)
parser = spider.Parser(max_deep=2)
saver = spider.Saver(save_pipe=open("out_spider_thread.txt", "w"))
saver = spider.Saver(save_pipe=open("out_thread.txt", "w"))

# define url_filter
url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=None)
@@ -38,7 +38,7 @@ def test_spider_distributed():
"""
test distributed spider
"""
# initial fetcher / parser / saver, you also can rewrite this three class
# initial fetcher / parser / saver, you also can rewrite this three classes
fetcher = spider.Fetcher(max_repeat=1, sleep_time=0)
parser = spider.Parser(max_deep=-1)
saver = spider.Saver(save_pipe=open("out_spider_distributed.txt", "w"))
@@ -56,7 +56,7 @@ def test_spider_distributed():


 if __name__ == "__main__":
-    logging.basicConfig(level=logging.WARNING, format="%(asctime)s\t%(levelname)s\t%(message)s")
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s\t%(levelname)s\t%(message)s")
     test_spider()
     # test_spider_distributed()
     exit()
