Commit 9a8aa23
update spider code: UrlFilter, and update code in demos_doubanmovies, version 1.6.2
xianhu committed Nov 30, 2016
1 parent 34c3d44 commit 9a8aa23
Showing 6 changed files with 61 additions and 28 deletions.
34 changes: 17 additions & 17 deletions demos_doubanmovies/movie_parser.py
@@ -31,41 +31,41 @@ def htm_parse(self, priority, url, keys, deep, critical, parse_repeat, content):
         content_left = soup.find("div", class_="subject clearfix")
 
         nbg_soup = content_left.find("a", class_="nbgnbg").find("img")
-        movie.append(nbg_soup.get("src") if nbg_soup else None)
+        movie.append(nbg_soup.get("src") if nbg_soup else "")
 
         info = content_left.find("div", id="info").get_text()
         info_dict = dict([line.strip().split(":", 1) for line in info.strip().split("\n") if line.strip().find(":") > 0])
 
-        movie.append(info_dict.get("导演"))
-        movie.append(info_dict.get("编剧"))
-        movie.append(info_dict.get("主演"))
+        movie.append(info_dict.get("导演", "").replace("\t", " "))
+        movie.append(info_dict.get("编剧", "").replace("\t", " "))
+        movie.append(info_dict.get("主演", "").replace("\t", " "))
 
-        movie.append(info_dict.get("类型"))
-        movie.append(info_dict.get("制片国家/地区"))
-        movie.append(info_dict.get("语言"))
+        movie.append(info_dict.get("类型", "").replace("\t", " "))
+        movie.append(info_dict.get("制片国家/地区", "").replace("\t", " "))
+        movie.append(info_dict.get("语言", "").replace("\t", " "))
 
-        movie.append(info_dict.get("上映日期") if "上映日期" in info_dict else info_dict.get("首播"))
-        movie.append(info_dict.get("季数"))
-        movie.append(info_dict.get("集数"))
-        movie.append(info_dict.get("片长") if "片长" in info_dict else info_dict.get("单集片长"))
+        movie.append(info_dict.get("上映日期", "").replace("\t", " ") if "上映日期" in info_dict else info_dict.get("首播", "").replace("\t", " "))
+        movie.append(info_dict.get("季数", "").replace("\t", " "))
+        movie.append(info_dict.get("集数", "").replace("\t", " "))
+        movie.append(info_dict.get("片长", "").replace("\t", " ") if "片长" in info_dict else info_dict.get("单集片长", "").replace("\t", " "))
 
-        movie.append(info_dict.get("又名"))
-        movie.append(info_dict.get("官方网站"))
-        movie.append(info_dict.get("官方小站"))
-        movie.append(info_dict.get("IMDb链接"))
+        movie.append(info_dict.get("又名", "").replace("\t", " "))
+        movie.append(info_dict.get("官方网站", "").replace("\t", " "))
+        movie.append(info_dict.get("官方小站", "").replace("\t", " "))
+        movie.append(info_dict.get("IMDb链接", "").replace("\t", " "))
 
         # 右边 (right side: rating info)
         content_right = soup.find("div", class_="rating_wrap clearbox")
         if content_right:
             movie.append(content_right.find("strong", class_="ll rating_num").get_text())
 
             rating_people = content_right.find("a", class_="rating_people")
-            movie.append(rating_people.find("span").get_text() if rating_people else None)
+            movie.append(rating_people.find("span").get_text() if rating_people else "")
 
             rating_per_list = [item.get_text() for item in content_right.find_all("span", class_="rating_per")]
             movie.append(", ".join(rating_per_list))
         else:
-            movie.extend([None, None, None])
+            movie.extend(["", "", ""])
 
         assert len(movie) == 21, "length of movie is invalid"
         save_list.append(movie)
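Note on the hunk above: missing info_dict keys now default to "" instead of None, and embedded tabs are flattened to spaces, so every field reaches the saver as a clean string. A standalone sketch of the same parsing pattern (the sample info text is invented for illustration):

    # standalone sketch of the parsing pattern above; the sample info text is invented
    info = """导演: 宫崎骏
    编剧: 宫崎骏
    类型: 动画\t/\t奇幻"""

    # split each "key: value" line on the first colon only, so colons inside values survive
    info_dict = dict([line.strip().split(":", 1) for line in info.strip().split("\n") if line.strip().find(":") > 0])

    movie = []
    movie.append(info_dict.get("导演", "").replace("\t", " "))  # " 宫崎骏" (values get strip()-ed later by the saver)
    movie.append(info_dict.get("主演", "").replace("\t", " "))  # "" -- missing key yields "" instead of None
    movie.append(info_dict.get("类型", "").replace("\t", " "))  # " 动画 / 奇幻" -- tabs flattened to spaces
    print(movie)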
14 changes: 13 additions & 1 deletion demos_doubanmovies/movie_saver.py
@@ -1,10 +1,22 @@
 # _*_ coding: utf-8 _*_
 
 import spider
+import pymysql
 
 
 class MovieSaver(spider.Saver):
 
+    def __init__(self):
+        spider.Saver.__init__(self)
+        self.conn = pymysql.connect(host="59.110.49.40", user="root", password="mimaMIMA123456", db="db_my", charset="utf8")
+        self.cursor = self.conn.cursor()
+        self.conn.autocommit(1)
+        return
+
     def item_save(self, url, keys, item):
-        self.save_pipe.write("\t".join([i if i else "" for i in item]) + "\n")
+        self.cursor.execute("insert into t_doubanmovies (m_url, m_name, m_year, m_imgurl, m_director, m_writer, m_actors, "
+                            "m_genre, m_country, m_language, m_release, m_season, m_jishu, m_length, m_alias, m_website, m_dbsite, "
+                            "m_imdb, m_score, m_comment, m_starpercent)"
+                            " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);",
+                            [i.strip() if i is not None else "" for i in item])
         return True
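item_save above now writes each 21-field row straight to MySQL with a parameterized insert, which also drops the old save_pipe file handle. A hedged sketch of the same insert path in isolation — the connection settings below are placeholders, and the t_doubanmovies schema itself is not part of this commit:

    # hedged sketch of the new save path; host/user/password below are placeholders
    import pymysql

    conn = pymysql.connect(host="127.0.0.1", user="user", password="password", db="db_my", charset="utf8")
    cursor = conn.cursor()
    conn.autocommit(1)  # each execute() commits immediately; no explicit conn.commit() needed

    item = ["https://movie.douban.com/subject/1292052/"] + [""] * 20  # 21 fields, matching the parser's assert
    columns = ("m_url, m_name, m_year, m_imgurl, m_director, m_writer, m_actors, m_genre, m_country, "
               "m_language, m_release, m_season, m_jishu, m_length, m_alias, m_website, m_dbsite, "
               "m_imdb, m_score, m_comment, m_starpercent")

    # %s placeholders make pymysql escape every value, so raw scraped text cannot break the statement
    cursor.execute("insert into t_doubanmovies (%s) values (%s);" % (columns, ", ".join(["%s"] * 21)),
                   [i.strip() if i is not None else "" for i in item])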
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name="spider",
-    version="1.6.1",
+    version="1.6.2",
     author="xianhu",
     keywords=["spider", "crawler"],
     packages=find_packages(exclude=("test", "test.*", "demos_*")),
17 changes: 12 additions & 5 deletions spider/utilities/util_urlfilter.py
@@ -25,11 +25,18 @@ def __init__(self, black_patterns=(CONFIG_URLPATTERN_ALL,), white_patterns=("^ht
         self.bloom_filter = pybloom_live.ScalableBloomFilter(capacity, error_rate=0.001) if capacity else None
         return
 
-    def update(self):
+    def update(self, url_list):
         """
-        update this urlfilter, you can rewrite this function if necessary
+        update this urlfilter using url_list
         """
-        assert False, "you must rewrite update function in %s" % self.__class__.__name__
+        if self.url_set is not None:
+            self.url_set.update(url_list)
+        elif self.bloom_filter is not None:
+            for url in url_list:
+                self.bloom_filter.add(url)
+        else:
+            pass
+        return
 
     def check(self, url):
         """
@@ -45,10 +52,10 @@ def check(self, url):
         for re_white in self.re_white_list:
             if re_white.search(url):
                 if self.url_set is not None:
-                    result = False if url in self.url_set else True
+                    result = (not (url in self.url_set))
                     self.url_set.add(url)
                 elif self.bloom_filter is not None:
-                    # bloom filter, "add": if key already exists, return True, else return False
+                    # "add": if key already exists, return True, else return False
                     result = (not self.bloom_filter.add(url))
                 else:
                     pass
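update(url_list) above replaces the old must-override stub with a working bulk seed for both backing stores, and check() relies on the same membership trick in both branches: pybloom_live's ScalableBloomFilter.add() returns True when the key was already present, so not add(url) reads as "this URL is new". A standalone sketch of that behavior (requires pip install pybloom-live; the URLs are illustrative):

    # standalone sketch of the dedup semantics used by update()/check(); URLs are illustrative
    import pybloom_live

    bloom_filter = pybloom_live.ScalableBloomFilter(1000, error_rate=0.001)

    # pre-seed the filter, as UrlFilter.update(url_list) now does in its bloom-filter branch
    for url in ["https://movie.douban.com/subject/1292052/"]:
        bloom_filter.add(url)

    # add() returns True if the key was (probably) already present, False if it was new
    print(not bloom_filter.add("https://movie.douban.com/subject/1292052/"))  # False: already crawled
    print(not bloom_filter.add("https://movie.douban.com/subject/1291546/"))  # True: new URL, passes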
1 change: 1 addition & 0 deletions test.py
@@ -22,6 +22,7 @@ def test_spider():
     black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
     white_patterns = ("^http[s]{0,1}://(www\.){0,1}(wandoujia|(zhushou\.360))\.(com|cn)", )
     url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=1000)
+    url_filter.update([])
 
     # 初始化WebSpider (initialize the WebSpider)
     web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)
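The empty update([]) above only exercises the new signature; when resuming a crawl, the list would carry already-fetched URLs. A hedged usage sketch (the patterns and URL are illustrative):

    # hedged usage sketch: seed the filter with already-crawled URLs before starting
    import spider

    url_filter = spider.UrlFilter(black_patterns=(spider.CONFIG_URLPATTERN_FILES,), white_patterns=("^http",), capacity=1000)
    url_filter.update(["http://www.wandoujia.com/apps/com.tencent.mm"])  # illustrative seed URL
    print(url_filter.check("http://www.wandoujia.com/apps/com.tencent.mm"))  # expected False: seen before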
21 changes: 17 additions & 4 deletions test_demos.py
@@ -6,6 +6,7 @@
 
 import re
 import spider
+import pymysql
 import logging
 import requests
 from bs4 import BeautifulSoup
@@ -42,16 +43,28 @@ def get_douban_movies():
     soup = BeautifulSoup(resp.text, "html5lib")
     a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE))
     all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list])
+    logging.warning("all urls: %s", len(all_urls))
 
+    # 查询已有数据 (query data already saved)
+    conn = pymysql.connect(host="59.110.49.40", user="root", password="mimaMIMA123456", db="db_my", charset="utf8")
+    cursor = conn.cursor()
+    cursor.execute("select m_url from t_doubanmovies;")
+
+    bloomfilter = spider.UrlFilter()
+    bloomfilter.update([item[0] for item in cursor.fetchall()])
+    logging.warning("update bloomfilter success: %s", cursor.rowcount)
+
+    cursor.close()
+    conn.close()
+
     # 构造爬虫 (construct the spider)
-    dou_spider = spider.WebSpider(MovieFetcher(), MovieParser(max_deep=-1, max_repeat=1), MovieSaver(open("doubanmovie.txt", "w")), spider.UrlFilter())
-    # dou_spider.set_start_url("https://movie.douban.com/tag/新海诚", ("index", "test"), priority=0, critical=False)
+    dou_spider = spider.WebSpider(MovieFetcher(), MovieParser(max_deep=-1, max_repeat=1), MovieSaver(), bloomfilter)
     for tag, url in all_urls:
         dou_spider.set_start_url(url, ("index", tag), priority=1, critical=True)
-        pass
     dou_spider.start_work_and_wait_done(fetcher_num=20)
     return
 
-if __name__ == '__main__':
+if __name__ == "__main__":
+
     logging.basicConfig(level=logging.WARNING, format="%(asctime)s\t%(levelname)s\t%(message)s")
     get_douban_movies()