forked from LiuRoy/zhihu_spider
Merge pull request LiuRoy#1 from LiuRoy/release/0.0.1
Crawl Zhihu user data
Showing 11 changed files with 531 additions and 0 deletions.
.gitignore (4 additions):

.idea/*
*.pyc
images
images/*
requirements.txt (3 additions):

Scrapy==1.0.3
pymongo==3.2.2
requests==2.7.0
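These pins date the project: on Scrapy 1.0.3 the deprecated scrapy.log module still works, and pymongo 3.2.2 still accepts the collection.update/insert calls used in the pipeline below; both were removed in later major releases, so running this code on current versions would require porting those calls.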
Crawler entry script (4 additions; file path not shown in this view):

# -*- coding=utf8 -*-
from scrapy import cmdline

cmdline.execute("scrapy crawl zhihu".split())
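The script just hands control to the Scrapy command line, so running it is equivalent to invoking scrapy crawl zhihu from the directory containing scrapy.cfg.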
scrapy.cfg (11 additions):

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = zhihu.settings

[deploy]
#url = http://localhost:6800/
project = zhihu
Empty file (presumably the zhihu package's __init__.py).
Constants module (35 additions; file path not shown in this view):

# -*- coding=utf8 -*-
"""
Constant definitions.
"""
from zhihu.settings import USER_AGENT


class Gender(object):
    """
    Gender values.
    """
    MALE = 1
    FEMALE = 2


class People(object):
    """
    People types.
    """
    Followee = 1
    Follower = 2


HEADER = {
    'Host': 'www.zhihu.com',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Accept': '*/*',
    'Origin': 'https://www.zhihu.com',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': USER_AGENT,
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
}
zhihu/items.py (50 additions):

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class ZhihuPeopleItem(Item):
    """Zhihu user profile.
    Attributes:
        nickname        display name
        zhihu_id        user id
        location        location
        business        industry
        gender          gender
        employment      employer
        position        job title
        education       education
        agree_count     upvotes received
        thanks_count    thanks received
        followee_count  number of users followed
        follower_count  number of followers
        image_url       avatar image URL
    """
    nickname = Field()
    zhihu_id = Field()
    location = Field()
    business = Field()
    gender = Field()
    employment = Field()
    position = Field()
    education = Field()
    agree_count = Field()
    thanks_count = Field()
    followee_count = Field()
    follower_count = Field()
    image_url = Field()


class ZhihuRelationItem(Item):
    """Zhihu user relations.
    Attributes:
        zhihu_id   Zhihu id
        user_list  list of related users
        user_type  relation type (1 = followee, 2 = follower)
    """
    zhihu_id = Field()
    user_list = Field()
    user_type = Field()
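Scrapy Item subclasses support dict-style access. As a sketch of how a spider callback might populate one (the values here are made up; the actual spider file did not render in this view):

# Hypothetical usage sketch; not part of the commit.
from zhihu.items import ZhihuPeopleItem

item = ZhihuPeopleItem()
item['zhihu_id'] = 'example-id'    # made-up value
item['nickname'] = 'example-user'
item['gender'] = 1                 # Gender.MALE in the constants module above
item['image_url'] = ''             # a falsy URL skips the avatar download in the pipeline
print(dict(item))                  # the pipeline stores items via dict(item)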
zhihu/pipelines.py (96 additions):

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

import requests
from pymongo import MongoClient
from scrapy import log

from zhihu.settings import MONGO_URI, PROJECT_DIR
from zhihu.items import ZhihuPeopleItem, ZhihuRelationItem


class ZhihuPipeline(object):
    """
    Store scraped data in MongoDB and download avatar images.
    """
    def __init__(self, mongo_uri, mongo_db, image_dir):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.image_dir = image_dir
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=MONGO_URI,
            mongo_db='zhihu',
            image_dir=os.path.join(PROJECT_DIR, 'images')
        )

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        if not os.path.exists(self.image_dir):
            os.mkdir(self.image_dir)

    def close_spider(self, spider):
        self.client.close()

    def _download_image(self, image_url):
        """
        Download an avatar image.
        """
        try:
            image_name = os.path.split(image_url)[-1]
            image_path = os.path.join(self.image_dir, image_name)
            image = requests.get(image_url, stream=True)
            with open(image_path, 'wb') as img:
                img.write(image.content)
        except Exception as exc:
            # log.ERROR is a level constant, not a callable; the original
            # log.ERROR(exc) would itself raise a TypeError
            log.msg(str(exc), level=log.ERROR)

    def _process_people(self, item):
        """
        Store a user profile (upsert keyed on zhihu_id).
        """
        collection = self.db['people']
        collection.update({'zhihu_id': item['zhihu_id']},
                          dict(item), upsert=True)

        image_url = item['image_url']
        if image_url:
            self._download_image(image_url)

    def _process_relation(self, item):
        """
        Store the follow-relation topology, merging user lists on re-crawl.
        """
        collection = self.db['relation']

        data = collection.find_one({
            'zhihu_id': item['zhihu_id'],
            'user_type': item['user_type']})
        if not data:
            self.db['relation'].insert(dict(item))
        else:
            origin_list = data['user_list']
            new_list = item['user_list']
            data['user_list'] = list(set(origin_list) | set(new_list))
            collection.update({'zhihu_id': item['zhihu_id'],
                               'user_type': item['user_type']}, data)

    def process_item(self, item, spider):
        """
        Dispatch each item to the matching handler.
        """
        if isinstance(item, ZhihuPeopleItem):
            self._process_people(item)
        elif isinstance(item, ZhihuRelationItem):
            self._process_relation(item)
        return item
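A minimal sketch for inspecting what the pipeline wrote, assuming a local MongoDB at the default MONGO_URI and the hard-coded 'zhihu' database (not part of the commit):

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
db = client['zhihu']

print(db['people'].count())                        # stored profiles (pymongo 3.x API, as pinned)
for rel in db['relation'].find({'user_type': 2}):  # 2 = Follower in the constants module
    print(rel['zhihu_id'], len(rel['user_list']))  # followers merged so far for this user
client.close()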
zhihu/settings.py (106 additions):

# -*- coding: utf-8 -*-

# Scrapy settings for zhihu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
import os

BOT_NAME = 'zhihu'

SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) ' \
             'AppleWebKit/537.36 (KHTML, like Gecko) ' \
             'Chrome/49.0.2623.87 Safari/537.36'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Cookies (enabled by default); kept enabled here
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Connection': 'keep-alive'
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zhihu.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zhihu.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'zhihu.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'

# Breadth-first crawl: a positive DEPTH_PRIORITY combined with FIFO queues
# makes the scheduler process shallower requests first
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

# Project path (resolves to the parent of the current working directory,
# so it depends on where the crawl is launched from)
PROJECT_DIR = os.path.dirname(os.path.abspath(os.path.curdir))

# MongoDB configuration
MONGO_URI = 'mongodb://localhost:27017'

# Item pipeline registration
ITEM_PIPELINES = {
    'zhihu.pipelines.ZhihuPipeline': 500,
}
zhihu/spiders/__init__.py (4 additions):

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
The eleventh file, presumably the spider implementation itself, failed to load in this view.