update code, version 2.6.0
xianhu committed Apr 18, 2021
1 parent 518c22f commit eb7ad17
Showing 8 changed files with 18 additions and 26 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -11,18 +11,18 @@ A simple web spider frame written by Python, which needs Python3.5+

### Modules of PSpider
1. utilities module: define some utilities functions and classes for multi-threading spider
-2. instances module: define classes of fetcher, parser, saver for multi-threading spider
+2. instances module: define classes of Fetcher, Parser, Saver for multi-threading spider
3. concurrent module: define WebSpiderFrame of multi-threading spider

### Procedure of PSpider
![](procedure.png)
-①: Fetchers get url from UrlQueue, and makes requests based on this url
+①: Fetchers get url from UrlQueue, and make requests based on this url
②: Put the result of ① to HtmlQueue, and so Parser can get it
-③: Parser gets item from HtmlQueue, and parses it to get new urls and items which need save
+③: Parser gets item from HtmlQueue, and parses it to get new urls and items
④: Put the new urls to UrlQueue, and so Fetcher can get it
⑤: Put the items to ItemQueue, and so Saver can get it
⑥: Saver gets item from ItemQueue, and saves it to filesystem or database
-⑦: Proxieser gets proxies from web or database and puts proxies to ProxiesQueue
+⑦: Proxieser gets proxies from web or database, and puts proxies to ProxiesQueue
⑧: Fetcher gets proxies from ProxiesQueue if needed, and makes requests based on this proxies
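
Steps ①–⑥ describe a plain producer/consumer pipeline. As a point of reference only (this is not PSpider's API; the function names below are made up), the same UrlQueue → HtmlQueue → ItemQueue hand-off can be sketched with the standard library:

```python
import queue
import threading

url_queue = queue.Queue()    # UrlQueue: urls waiting to be fetched
html_queue = queue.Queue()   # HtmlQueue: fetched pages waiting to be parsed
item_queue = queue.Queue()   # ItemQueue: parsed items waiting to be saved

def fetcher_loop():
    while True:
        url = url_queue.get()                 # step 1: take a url from UrlQueue
        if url is None:
            break
        html_queue.put((url, "<html>content of %s</html>" % url))   # step 2: hand the page to the parser
        url_queue.task_done()

def parser_loop():
    while True:
        task = html_queue.get()               # step 3: take a fetched page from HtmlQueue
        if task is None:
            break
        url, html = task                      # (step 4, feeding new urls back, is omitted here)
        item_queue.put({"url": url, "length": len(html)})            # step 5: hand the item to the saver
        html_queue.task_done()

def saver_loop():
    while True:
        item = item_queue.get()               # step 6: persist the item
        if item is None:
            break
        print("saved:", item)
        item_queue.task_done()

threads = [threading.Thread(target=fn) for fn in (fetcher_loop, parser_loop, saver_loop)]
for t in threads:
    t.start()

url_queue.put("http://example.com/")
for q in (url_queue, html_queue, item_queue):  # wait for each stage to drain, in pipeline order
    q.join()
for q in (url_queue, html_queue, item_queue):  # then tell every loop to stop
    q.put(None)
for t in threads:
    t.join()
```

PSpider's WebSpiderFrame layers the Proxieser/ProxiesQueue path (⑦–⑧), a monitor thread, and bounded queues on top of this basic flow.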

### Tutorials of PSpider
6 changes: 3 additions & 3 deletions spider/concurrent/threads_inst/base.py
@@ -73,7 +73,7 @@ def working(self):


# ===============================================================================================================================
-def init_monitor_thread(self, name, pool):
+def init_monitor(self, name, pool):
"""
constructor of MonitorThread
"""
@@ -88,7 +88,7 @@ def init_monitor_thread(self, name, pool):

def work_monitor(self):
"""
-monitor of the thread pool, auto running and return False if you need stop thread
+procedure of MonitorThread, auto running and return False if you need stop thread
"""
time.sleep(5)

@@ -128,4 +128,4 @@ def work_monitor(self):
return not self._pool.is_ready_to_finish()


-MonitorThread = type("MonitorThread", (BaseThread, ), dict(__init__=init_monitor_thread, working=work_monitor))
+MonitorThread = type("MonitorThread", (BaseThread, ), dict(__init__=init_monitor, working=work_monitor))
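
MonitorThread is assembled with the three-argument form of the built-in `type()`, which builds a class from a name, a tuple of bases, and a dict of attributes — equivalent to a class statement whose methods are the two plain functions defined above. A tiny standalone illustration of the same pattern (names here are invented for the example):

```python
# Build a class dynamically with type(name, bases, namespace),
# the same pattern used for MonitorThread above.
class Base:
    def greet(self):
        return "hello from " + type(self).__name__

def init_worker(self, name):
    Base.__init__(self)
    self.name = name

def run_worker(self):
    return "%s: %s" % (self.name, self.greet())

# Equivalent to: class Worker(Base): __init__ = init_worker; working = run_worker
Worker = type("Worker", (Base,), dict(__init__=init_worker, working=run_worker))

w = Worker("w1")
print(w.working())   # -> "w1: hello from Worker"
```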
2 changes: 1 addition & 1 deletion spider/concurrent/threads_inst/fetch.py
@@ -16,7 +16,7 @@ class of FetchThread, as the subclass of BaseThread

def __init__(self, name, worker, pool):
"""
-constructor
+constructor, add proxies to this thread
"""
BaseThread.__init__(self, name, worker, pool)
self._proxies = None
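
`self._proxies` holds the proxy entry a FetchThread pulls from ProxiesQueue (step ⑧ in the README). Assuming the worker fetches with the requests library — an assumption, since the fetch code itself is not part of this diff — the proxies would typically be passed along like this:

```python
# Hypothetical sketch: how a per-thread proxies dict is usually applied with requests.
# The URL and proxy address are placeholders.
import requests

proxies = {"http": "http://10.0.0.1:8080", "https": "http://10.0.0.1:8080"}

resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)
print(resp.status_code)
```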
4 changes: 2 additions & 2 deletions spider/concurrent/threads_inst/parse.py
@@ -17,15 +17,15 @@ class of ParseThread, as the subclass of BaseThread

def __init__(self, name, worker, pool):
"""
-constructor
+constructor, add pool_mp to this thread
"""
BaseThread.__init__(self, name, worker, pool)
self._pool_mp = multiprocessing.Pool()
return

def __del__(self):
"""
-terminate self._pool_mp
+destructor, terminate self._pool_mp
"""
self._pool_mp.terminate()
return
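
Each ParseThread owns a `multiprocessing.Pool` so CPU-heavy parsing can run outside the GIL, and terminates it when the thread object goes away. The construct/terminate pattern, reduced to a standalone sketch (illustrative only — not PSpider's real parse code):

```python
import multiprocessing

def parse_html(html):
    # stand-in for real parsing work (CPU-bound, so it benefits from a process pool)
    return {"length": len(html), "has_title": "<title>" in html}

class ParseWorker:
    def __init__(self):
        # one process pool per worker, created in the constructor
        self._pool_mp = multiprocessing.Pool()

    def parse(self, html):
        # hand the CPU-bound work to a child process and wait for the result
        return self._pool_mp.apply(parse_html, (html,))

    def close(self):
        # mirror of the __del__ above: stop the pool's worker processes
        self._pool_mp.terminate()

if __name__ == "__main__":
    worker = ParseWorker()
    print(worker.parse("<html><title>demo</title></html>"))
    worker.close()
```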
2 changes: 1 addition & 1 deletion spider/concurrent/threads_pool.py
@@ -19,7 +19,7 @@ class of ThreadPool

def __init__(self, fetcher, parser=None, saver=None, proxieser=None, url_filter=None, queue_parse_size=-1, queue_save_size=-1, queue_proxies_size=-1):
"""
-constructor, queue_*_size is the maximum size of each queue, -1 to no limition
+constructor, queue_parse_size/queue_save_size/queue_proxies_size are the maximum size of each queue, -1 to no limition
"""
self._inst_fetcher = fetcher # fetcher instance, subclass of Fetcher
self._inst_parser = parser # parser instance, subclass of Parser or None
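
The `-1` default mirrors the standard library convention: a `queue.Queue` whose maxsize is zero or negative is unbounded, while a positive maxsize makes `put()` block (or raise) once the queue is full — presumably the behaviour behind the parse/save/proxies queue sizes here (an assumption; the queue construction itself is outside this hunk):

```python
import queue

unbounded = queue.Queue(maxsize=-1)   # <= 0 means "no size limit" in the stdlib
bounded = queue.Queue(maxsize=2)      # put() blocks (or raises with block=False) when full

bounded.put("a")
bounded.put("b")
try:
    bounded.put("c", block=False)     # a third item does not fit
except queue.Full:
    print("bounded queue is full after 2 items")

for i in range(10_000):
    unbounded.put(i)                  # never blocks, regardless of size
print("unbounded queue holds", unbounded.qsize(), "items")
```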
4 changes: 2 additions & 2 deletions spider/utilities/util_config.py
@@ -14,11 +14,11 @@
"CONFIG_HEADERS_SET",
]

-# define structure of error message for threads_inst
+# define structure of error message for threads_inst/
CONFIG_ERROR_MESSAGE = "priority=%s, keys=%s, deep=%s, url=%s"
CONFIG_ERROR_MESSAGE_RE = re.compile(r"priority=(?P<p>\d+),\s*keys=(?P<k>.+?),\s*deep=(?P<d>\d+),\s*url=(?P<u>.+)$", flags=re.IGNORECASE)

-# define the regex for urls
+# define the regex for legal urls and illegal urls
CONFIG_URL_LEGAL_RE = re.compile(r"^https?:[^\s]+?\.[^\s]+?", flags=re.IGNORECASE)
CONFIG_URL_ILLEGAL_RE = re.compile(r"\.(cab|iso|zip|rar|tar|gz|bz2|7z|tgz|apk|exe|app|pkg|bmg|rpm|deb|dmg|jar|jad|bin|msi|"
"pdf|doc|docx|xls|xlsx|ppt|pptx|txt|md|odf|odt|rtf|py|java|c|cc|js|css|log|csv|tsv|"
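
CONFIG_ERROR_MESSAGE and CONFIG_ERROR_MESSAGE_RE are a matched pair: one formats a task's state into a log string, the other recovers the fields from it. A quick round trip with the patterns exactly as defined above (the example values are made up):

```python
import re

CONFIG_ERROR_MESSAGE = "priority=%s, keys=%s, deep=%s, url=%s"
CONFIG_ERROR_MESSAGE_RE = re.compile(
    r"priority=(?P<p>\d+),\s*keys=(?P<k>.+?),\s*deep=(?P<d>\d+),\s*url=(?P<u>.+)$",
    flags=re.IGNORECASE)

# format a task's state into one line ...
message = CONFIG_ERROR_MESSAGE % (1, {"type": "news"}, 0, "http://example.com/a.html")

# ... and parse the fields back out of it
match = CONFIG_ERROR_MESSAGE_RE.search(message)
print(match.group("p"), match.group("d"), match.group("u"))
# -> 1 0 http://example.com/a.html

# the legal-url pattern accepts ordinary http(s) urls
CONFIG_URL_LEGAL_RE = re.compile(r"^https?:[^\s]+?\.[^\s]+?", flags=re.IGNORECASE)
print(bool(CONFIG_URL_LEGAL_RE.match("http://example.com/index.html")))   # True
print(bool(CONFIG_URL_LEGAL_RE.match("ftp://example.com/file")))          # False
```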
2 changes: 1 addition & 1 deletion spider/utilities/util_funcs.py
@@ -28,7 +28,7 @@ def check_url_legal(url):

def get_url_legal(url, base_url, encoding=None):
"""
-get a legal url from a url, based on base_url
+get a legal url from a url string, based on base_url
"""
return urllib.parse.urljoin(base_url, urllib.parse.quote(url, safe="%/:=&?~#+!$,;'@()*[]|", encoding=encoding))

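
get_url_legal is a thin wrapper over the standard library: `quote()` escapes unsafe characters while keeping the listed URL punctuation intact, and `urljoin()` resolves the result against base_url. The same two calls in isolation (example URLs are made up):

```python
import urllib.parse

SAFE = "%/:=&?~#+!$,;'@()*[]|"

def get_url_legal(url, base_url, encoding=None):
    # same body as the function shown in the diff above
    return urllib.parse.urljoin(base_url, urllib.parse.quote(url, safe=SAFE, encoding=encoding))

print(get_url_legal("/list?page=2", "http://example.com/news/index.html"))
# -> http://example.com/list?page=2
print(get_url_legal("详情.html", "http://example.com/news/", encoding="utf-8"))
# -> http://example.com/news/%E8%AF%A6%E6%83%85.html
```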
16 changes: 4 additions & 12 deletions spider/utilities/util_urlfilter.py
@@ -14,19 +14,11 @@ class of UrlFilter, to filter url by regexs and set

def __init__(self, black_patterns=(CONFIG_URL_ILLEGAL_RE,), white_patterns=(CONFIG_URL_LEGAL_RE,)):
"""
-constructor, use the instance of BloomFilter if capacity else the instance of set
+constructor
"""
self._urlfilter = set()
-self._re_black_list = [item_re for item_re in black_patterns]
-self._re_white_list = [item_re for item_re in white_patterns]
-return
-
-def update(self, url_list):
-"""
-update this urlfilter using a url_list
-"""
-for url in filter(lambda x: CONFIG_URL_ILLEGAL_RE.match(x), url_list):
-self._urlfilter.add(url)
+self._re_black_list = black_patterns
+self._re_white_list = white_patterns
+return

def check(self, url):
@@ -45,7 +37,7 @@ def check(self, url):

def check_and_add(self, url):
"""
-check whether url is in this urlfilter, and add url to this urlfilter
+check whether url is in this urlfilter, and add url to it
"""
result = False
if self.check(url):
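
After this commit UrlFilter is just a seen-set plus the black/white regex lists; check() presumably rejects a url that hits a black pattern, misses every white pattern, or was already recorded, and check_and_add() records it otherwise. A standalone sketch of that filtering pattern (an interpretation — the body of check() is collapsed in this diff):

```python
import re

class SimpleUrlFilter:
    # illustrative re-creation of the set + regex filtering pattern, not PSpider's exact code
    def __init__(self, black_patterns=(), white_patterns=(re.compile(r"^https?:"),)):
        self._seen = set()
        self._black = list(black_patterns)
        self._white = list(white_patterns)

    def check(self, url):
        if any(p.search(url) for p in self._black):
            return False                      # blacklisted
        if self._white and not any(p.search(url) for p in self._white):
            return False                      # not whitelisted
        return url not in self._seen          # reject duplicates

    def check_and_add(self, url):
        ok = self.check(url)
        if ok:
            self._seen.add(url)
        return ok

uf = SimpleUrlFilter(black_patterns=(re.compile(r"\.(zip|exe)$", re.I),))
print(uf.check_and_add("http://example.com/page.html"))   # True, first time seen
print(uf.check_and_add("http://example.com/page.html"))   # False, duplicate
print(uf.check_and_add("http://example.com/setup.exe"))   # False, blacklisted
```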
