Commit

Merge pull request LiuRoy#1 from LiuRoy/release/0.0.1
Crawl Zhihu user data
LiuRoy committed Mar 19, 2016
2 parents d78c5d4 + 8f2cba5 commit d6f721e
Showing 11 changed files with 531 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.idea/*
*.pyc
images
images/*
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
Scrapy==1.0.3
pymongo==3.2.2
requests==2.7.0
4 changes: 4 additions & 0 deletions zhihu/main.py
@@ -0,0 +1,4 @@
# -*- coding=utf8 -*-
from scrapy import cmdline

cmdline.execute("scrapy crawl zhihu".split())
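
main.py is just a convenience entry point that shells out to the Scrapy CLI. For reference, a rough equivalent using Scrapy's in-process API (a sketch, not part of this commit; the spider name 'zhihu' is taken from the crawl command above):

# Sketch: run the same crawl through Scrapy's Python API instead of cmdline.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads zhihu.settings
process.crawl('zhihu')  # spider name used by "scrapy crawl zhihu"
process.start()         # blocks until the crawl finishes
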
11 changes: 11 additions & 0 deletions zhihu/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = zhihu.settings

[deploy]
#url = http://localhost:6800/
project = zhihu
Empty file added zhihu/zhihu/__init__.py
Empty file.
35 changes: 35 additions & 0 deletions zhihu/zhihu/constants.py
@@ -0,0 +1,35 @@
# -*- coding=utf8 -*-
"""
Constant definitions.
"""
from zhihu.settings import USER_AGENT

class Gender(object):
    """
    Gender values.
    """
    MALE = 1
    FEMALE = 2


class People(object):
    """
    Types of people lists.
    """
    Followee = 1
    Follower = 2


HEADER = {
    'Host': 'www.zhihu.com',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Accept': '*/*',
    'Origin': 'https://www.zhihu.com',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': USER_AGENT,
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
}
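
HEADER reproduces the request headers a logged-in browser sends to Zhihu's AJAX endpoints, and the two classes are plain enumerations used to tag scraped data. A minimal usage sketch (the URL below is a placeholder for illustration only, not an endpoint defined in this commit):

import requests

from zhihu.constants import HEADER, People

# Placeholder URL purely for illustration; the real endpoints live in the spider.
resp = requests.get('https://www.zhihu.com/people/example-user/followees',
                    headers=HEADER)
list_kind = People.Followee  # 1 = people this user follows, 2 = followers
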
50 changes: 50 additions & 0 deletions zhihu/zhihu/items.py
@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class ZhihuPeopleItem(Item):
    """Zhihu user profile.

    Attributes:
        nickname        display name
        zhihu_id        user id
        location        location
        business        industry
        gender          gender
        employment      employer
        position        job title
        education       education background
        agree_count     number of upvotes received
        thanks_count    number of thanks received
        followee_count  number of users this user follows
        follower_count  number of followers
        image_url       avatar image url
    """
    nickname = Field()
    zhihu_id = Field()
    location = Field()
    business = Field()
    gender = Field()
    employment = Field()
    position = Field()
    education = Field()
    agree_count = Field()
    thanks_count = Field()
    followee_count = Field()
    follower_count = Field()
    image_url = Field()


class ZhihuRelationItem(Item):
    """Zhihu follow relation.

    Attributes:
        zhihu_id   zhihu id of the source user
        user_list  list of related user ids
        user_type  relation type (1 = followees, 2 = followers)
    """
    zhihu_id = Field()
    user_list = Field()
    user_type = Field()
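
For reference, a sketch of how a spider callback might fill these items before yielding them to the pipeline (all field values below are made-up examples):

from zhihu.constants import Gender, People
from zhihu.items import ZhihuPeopleItem, ZhihuRelationItem

people = ZhihuPeopleItem(
    nickname=u'example',
    zhihu_id='example-user',
    gender=Gender.MALE,
    agree_count=10,
    thanks_count=3,
    followee_count=2,
    follower_count=5,
    image_url='',
)
relation = ZhihuRelationItem(
    zhihu_id='example-user',
    user_list=['user-a', 'user-b'],
    user_type=People.Followee,
)
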
96 changes: 96 additions & 0 deletions zhihu/zhihu/pipelines.py
@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

import requests
from pymongo import MongoClient
from scrapy import log

from zhihu.settings import MONGO_URI, PROJECT_DIR
from zhihu.items import ZhihuPeopleItem, ZhihuRelationItem


class ZhihuPipeline(object):
    """
    Persist scraped data to MongoDB and download avatar images.
    """
    def __init__(self, mongo_uri, mongo_db, image_dir):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.image_dir = image_dir
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=MONGO_URI,
            mongo_db='zhihu',
            image_dir=os.path.join(PROJECT_DIR, 'images')
        )

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        if not os.path.exists(self.image_dir):
            os.mkdir(self.image_dir)

    def close_spider(self, spider):
        self.client.close()

    def _download_image(self, image_url):
        """
        Download the avatar image into the local image directory.
        """
        try:
            image_name = os.path.split(image_url)[-1]
            image_path = os.path.join(self.image_dir, image_name)
            image = requests.get(image_url, stream=True)
            with open(image_path, 'wb') as img:
                img.write(image.content)
        except Exception as exc:
            # record the failure without aborting the crawl
            log.err(exc)

    def _process_people(self, item):
        """
        Upsert a user profile, keyed by zhihu_id.
        """
        collection = self.db['people']
        collection.update({'zhihu_id': item['zhihu_id']},
                          dict(item), upsert=True)

        image_url = item['image_url']
        if image_url:
            self._download_image(image_url)

    def _process_relation(self, item):
        """
        Store the follow relation between users, merging user lists on repeat visits.
        """
        collection = self.db['relation']

        data = collection.find_one({
            'zhihu_id': item['zhihu_id'],
            'user_type': item['user_type']})
        if not data:
            collection.insert(dict(item))
        else:
            origin_list = data['user_list']
            new_list = item['user_list']
            data['user_list'] = list(set(origin_list) | set(new_list))
            collection.update({'zhihu_id': item['zhihu_id'],
                               'user_type': item['user_type']}, data)

    def process_item(self, item, spider):
        """
        Dispatch the item to the matching handler.
        """
        if isinstance(item, ZhihuPeopleItem):
            self._process_people(item)
        elif isinstance(item, ZhihuRelationItem):
            self._process_relation(item)
        return item
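
Once the pipeline has run, the stored documents can be inspected straight from pymongo; a small sketch (database and collection names follow the pipeline above):

from pymongo import MongoClient

from zhihu.settings import MONGO_URI

client = MongoClient(MONGO_URI)
db = client['zhihu']

print('profiles stored: %d' % db['people'].count())
for rel in db['relation'].find({'user_type': 1}):  # 1 = followee lists
    print('%s follows %d users' % (rel['zhihu_id'], len(rel['user_list'])))
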
106 changes: 106 additions & 0 deletions zhihu/zhihu/settings.py
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-

# Scrapy settings for zhihu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
import os

BOT_NAME = 'zhihu'

SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) ' \
             'AppleWebKit/537.36 (KHTML, like Gecko) ' \
             'Chrome/49.0.2623.87 Safari/537.36'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Keep cookies enabled (Scrapy's default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Connection': 'keep-alive'
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'zhihu.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'zhihu.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'zhihu.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'

# Breadth-first crawl: a positive DEPTH_PRIORITY combined with FIFO queues
# makes Scrapy process shallower requests before deeper ones.
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

# Project path (parent of the working directory the crawler is started from)
PROJECT_DIR = os.path.dirname(os.path.abspath(os.path.curdir))

# MongoDB configuration
MONGO_URI = 'mongodb://localhost:27017'

# Item pipeline configuration
ITEM_PIPELINES = {
    'zhihu.pipelines.ZhihuPipeline': 500,
}
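
A quick way to confirm which values Scrapy actually picks up when started from the project directory (a sketch using Scrapy's settings helper, not part of this commit):

from scrapy.utils.project import get_project_settings

settings = get_project_settings()  # resolves zhihu.settings via scrapy.cfg
print(settings['DEPTH_PRIORITY'])   # 1, i.e. breadth-first ordering
print(settings['ITEM_PIPELINES'])   # {'zhihu.pipelines.ZhihuPipeline': 500}
print(settings['MONGO_URI'])        # custom settings are exposed the same way
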
4 changes: 4 additions & 0 deletions zhihu/zhihu/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
(Diff for the remaining changed file, presumably the spider under zhihu/zhihu/spiders/, did not load on this page.)
