update code, version 1.1.7
xianhu committed Mar 28, 2018
1 parent 37b8005 commit df5ed1f
Showing 9 changed files with 18 additions and 18 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@

 setup(
     name="spider",
-    version="1.1.6",
+    version="1.1.7",
     author="xianhu",
     keywords=["spider", "crawler", "multi-threads", "distributed", "proxies"],
     packages=find_packages(exclude=("test.*",)),
2 changes: 1 addition & 1 deletion spider/__init__.py
@@ -4,7 +4,7 @@
 define WebSpider, WebSpiderDist, and also define utilities and instances for web_spider
 """

-__version__ = "1.1.6"
+__version__ = "1.1.7"

 from .utilities import *
 from .instances import Fetcher, Parser, Saver, Proxieser
5 changes: 3 additions & 2 deletions spider/concurrent/threads_inst/threads_inst_proxies.py
@@ -22,8 +22,9 @@ def working(self):
         proxies_result, proxies_list = self._worker.working()

         # ----3----
-        for proxies in proxies_list:
-            self._pool.add_a_task(TPEnum.PROXIES, proxies)
+        if proxies_result > 0:
+            for proxies in proxies_list:
+                self._pool.add_a_task(TPEnum.PROXIES, proxies)

         # ----*----
         while (not self._pool.is_all_tasks_done()) and (self._pool.get_number_dict(TPEnum.PROXIES_LEFT) > 100):
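The new "proxies_result > 0" guard keeps a failed run of the proxies worker from queueing anything. A minimal runnable sketch of the pattern, where fetch_proxies is an assumed stand-in for the worker's real proxy source:

def fetch_proxies():
    # assumed helper standing in for the real proxy source; may raise on network errors
    return [{"http": "http://1.2.3.4:8080"}]

def working():
    # mirrors the worker's (proxies_result, proxies_list) return contract
    try:
        return 1, fetch_proxies()
    except Exception:
        return -1, []   # a failed run reports -1 and an empty list

proxies_result, proxies_list = working()
if proxies_result > 0:   # the new guard: only queue proxies after a successful run
    for proxies in proxies_list:
        print("queued:", proxies)   # stands in for self._pool.add_a_task(TPEnum.PROXIES, proxies)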
12 changes: 6 additions & 6 deletions spider/concurrent/threads_pool.py
@@ -77,7 +77,7 @@ def start_working(self, fetcher_num=10):
"""
start this thread pool
"""
logging.info("%s start: urls_count=%s, fetcher_num=%s", self.__class__.__name__, self.get_number_dict(TPEnum.URL_FETCH_NOT), fetcher_num)
logging.info("%s start working: urls_count=%s, fetcher_num=%s", self.__class__.__name__, self.get_number_dict(TPEnum.URL_FETCH_NOT), fetcher_num)

self._thread_proxieser = ProxiesThread("proxieser", self._inst_proxieser, self) if self._inst_proxieser else None
self._thread_fetcher_list = [FetchThread("fetcher-%d" % (i+1), copy.deepcopy(self._inst_fetcher), self) for i in range(fetcher_num)]
@@ -126,19 +126,19 @@ def wait_for_finished(self):
     # ================================================================================================================================
     def get_proxies_flag(self):
         """
-        get the proxies flag of this pool
+        get the proxies flag of this thread pool
         """
         return True if self._inst_proxieser else False

     def get_thread_stop_flag(self):
         """
-        get the stop flag of threads
+        get the threads' stop flag of this thread pool
         """
         return self._thread_stop_flag

     def get_current_state(self):
         """
-        get current state of this pool
+        get current state of this thread pool
         """
         return self._number_dict

@@ -186,7 +186,7 @@ def add_a_task(self, task_name, task_content):

     def get_a_task(self, task_name):
         """
-        get a task based on task_name, if queue is empty, raise queue.Empty, also for proxies
+        get a task based on task_name, also for proxies
         """
         task_content = None
         if task_name == TPEnum.PROXIES:
@@ -207,7 +207,7 @@ def finish_a_task(self, task_name):

     def finish_a_task(self, task_name):
         """
-        finish a task based on task_name, call queue.task_done(), also for proxies
+        finish a task based on task_name, also for proxies
         """
         if task_name == TPEnum.PROXIES:
             self._queue_proxies.task_done()
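Although the docstrings no longer mention it, get_a_task still raises queue.Empty when nothing is queued, and finish_a_task still calls queue.task_done(); pairing every get with a finish is what lets the pool detect that all tasks are done. A minimal sketch of that accounting with a plain queue.Queue (the names here are illustrative, not the pool's API):

import queue

tasks = queue.Queue()
tasks.put("http://example.com")   # add_a_task

url = tasks.get(timeout=1)        # get_a_task: raises queue.Empty if nothing is queued
try:
    print("fetching", url)        # the actual work happens here
finally:
    tasks.task_done()             # finish_a_task: required for join() to ever return

tasks.join()                      # returns only when every put() has a matching task_done()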
4 changes: 2 additions & 2 deletions spider/concurrent/threads_pool_dist.py
@@ -59,7 +59,7 @@ def add_a_task(self, task_name, task_content):

     def get_a_task(self, task_name):
         """
-        get a task based on task_name, if queue is empty, raise queue.Empty, also for proxies
+        get a task based on task_name, also for proxies
         """
         task_content = None
         if task_name == TPEnum.PROXIES:
@@ -79,7 +79,7 @@ def get_a_task(self, task_name):

     def finish_a_task(self, task_name):
         """
-        finish a task based on task_name, call queue.task_done(), also for proxies
+        finish a task based on task_name, also for proxies
        """
         if task_name == TPEnum.PROXIES:
             self._queue_proxies.task_done()
2 changes: 1 addition & 1 deletion spider/instances/inst_fetch.py
@@ -30,7 +30,7 @@ def working(self, priority: int, url: str, keys: dict, deep: int, repeat: int, p
"""
working function, must "try, except" and don't change the parameters and return
:return fetch_result: can be -1(fetch failed), 0(need repeat), 1(fetch success)
:return proxies_state: can be False(unavaiable), True(avaiable), default True
:return proxies_state: can be False(unavaiable), True(avaiable)
:return content: can be any object, for example string, list, etc
"""
logging.debug("%s start: %s", self.__class__.__name__, CONFIG_FETCH_MESSAGE % (priority, keys, deep, repeat, url))
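For context, a Fetcher subclass overrides working and returns the (fetch_result, proxies_state, content) triple documented above. A minimal sketch using requests; the class body is an assumption for illustration, not the library's shipped implementation:

import logging
import requests

class MyFetcher:
    # sketch of a fetcher honoring the contract; the real class would
    # subclass spider.Fetcher, kept bare here to stay self-contained
    def working(self, priority, url, keys, deep, repeat, proxies=None):
        try:
            resp = requests.get(url, proxies=proxies, timeout=10)
            return 1, True, resp.text   # 1: fetch success, proxies still usable
        except requests.exceptions.ProxyError:
            return 0, False, None       # 0: need repeat, mark proxies unavailable
        except Exception as excep:
            logging.error("fetch failed: %s", excep)
            return -1, True, None       # -1: fetch failed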
5 changes: 2 additions & 3 deletions spider/instances/inst_parse.py
@@ -7,7 +7,7 @@
 import re
 import logging
 import datetime
-from ..utilities import CONFIG_PARSE_MESSAGE, get_url_legal
+from ..utilities import CONFIG_PARSE_MESSAGE


 class Parser(object):
@@ -49,8 +49,7 @@ def htm_parse(self, priority: int, url: str, keys: dict, deep: int, content: obj

         url_list = []
         if (self._max_deep < 0) or (deep < self._max_deep):
-            tmp_list = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>", html_text, flags=re.IGNORECASE)
-            url_list = [(_url, keys, priority+1) for _url in [get_url_legal(href, url) for href in tmp_list]]
+            url_list = [(_url, keys, priority+1) for _url in re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>", html_text, flags=re.IGNORECASE)]

         title = re.search(r"<title>(?P<title>.+?)</title>", html_text, flags=re.IGNORECASE)
         save_list = [(url, title.group("title").strip(), datetime.datetime.now()), ] if title else []
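Note the behavioral change here: get_url_legal resolved each matched href against the page URL before queueing, so relative links became absolute, while the new one-liner queues hrefs exactly as matched. A short sketch of the difference, assuming get_url_legal was urljoin-based:

import re
from urllib.parse import urljoin

page_url = "http://example.com/index.html"
html_text = '<a href="/about">About</a> <a href="http://other.example/x">X</a>'

hrefs = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>", html_text, flags=re.IGNORECASE)
print(hrefs)   # ['/about', 'http://other.example/x'] -- relative href kept as-is (new behavior)

# old behavior (assuming get_url_legal wrapped urljoin): relative hrefs became absolute
print([urljoin(page_url, href) for href in hrefs])
# ['http://example.com/about', 'http://other.example/x']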
2 changes: 1 addition & 1 deletion spider/instances/inst_proxies.py
@@ -24,7 +24,7 @@ def __init__(self, sleep_time=10):
     def working(self) -> (int, list):
         """
         working function, must "try, except" and don't change the parameters and return
-        :return proxies_result: can be -1(get proxies failed), 1(get proxies success)
+        :return proxies_result: can be -1(get failed), 1(get success)
         :return proxies_list: can be a proxies list fetched from the web or a database
         """
         logging.debug("%s start", self.__class__.__name__)
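A Proxieser subclass supplies the proxies by overriding working. A minimal sketch honoring the (proxies_result, proxies_list) contract; the source URL and the ip:port response format are assumptions:

import logging
import requests

class MyProxieser:
    # sketch only; the real class would subclass spider.Proxieser
    def working(self):
        try:
            # assumed source: a plain-text endpoint returning one "ip:port" per line
            resp = requests.get("http://proxy-source.example/list.txt", timeout=10)
            proxies_list = [{"http": "http://" + line.strip()}
                            for line in resp.text.splitlines() if line.strip()]
            return 1, proxies_list   # 1: get success
        except Exception as excep:
            logging.error("get proxies failed: %s", excep)
            return -1, []            # -1: get failed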
2 changes: 1 addition & 1 deletion spider/utilities/util_parse.py
@@ -20,7 +20,7 @@ def get_string_num(string, ignore_sign=False):
     get a float number from a string
     """
     string_re = re.search(r"(?P<sign>-?)(?P<num>\d+(\.\d+)?)", get_string_strip(string.replace(",", ""), replace_char=""), flags=re.IGNORECASE)
-    return float((string_re.group("sign") if not ignore_sign else "") + string_re.group("num")) if string_re else 0.0
+    return float((string_re.group("sign") if not ignore_sign else "") + string_re.group("num")) if string_re else None


 def get_string_strip(string, replace_char=" "):
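Returning None instead of 0.0 lets callers tell a failed parse apart from a string that genuinely contains zero. A runnable sketch of the new behavior; get_string_strip is reimplemented here on the assumption that it collapses whitespace runs to replace_char:

import re

def get_string_strip(string, replace_char=" "):
    # assumed behavior of the real helper: collapse whitespace to replace_char
    return re.sub(r"\s+", replace_char, string).strip() if string else ""

def get_string_num(string, ignore_sign=False):
    # verbatim logic from the diff above
    string_re = re.search(r"(?P<sign>-?)(?P<num>\d+(\.\d+)?)",
                          get_string_strip(string.replace(",", ""), replace_char=""))
    return float((string_re.group("sign") if not ignore_sign else "") + string_re.group("num")) if string_re else None

print(get_string_num("price: -1,234.5"))              # -1234.5
print(get_string_num("-3 apples", ignore_sign=True))  # 3.0
print(get_string_num("no digits at all"))             # None (was 0.0 before this commit)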
