forked from LiuRoy/zhihu_spider
Merge pull request LiuRoy#1 from LiuRoy/release/0.0.1
Crawl Zhihu user data
Showing 11 changed files with 531 additions and 0 deletions.
.gitignore (4 additions):

.idea/*
*.pyc
images
images/*
requirements.txt (3 additions):

Scrapy==1.0.3
pymongo==3.2.2
requests==2.7.0
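These pins date the project: on Scrapy 1.0.3 the deprecated scrapy.log module still works, and pymongo 3.2.2 still accepts the collection.update/insert calls used in the pipeline below; both were removed in later major releases, so running this code on current versions would require porting those calls.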
Crawler entry script (4 additions; file path not shown in this view):

# -*- coding=utf8 -*-
from scrapy import cmdline

cmdline.execute("scrapy crawl zhihu".split())
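The script just hands control to the Scrapy command line, so running it is equivalent to invoking scrapy crawl zhihu from the directory containing scrapy.cfg.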
scrapy.cfg (11 additions):

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = zhihu.settings

[deploy]
#url = http://localhost:6800/
project = zhihu
Empty file (presumably the zhihu package's __init__.py).
Constants module (35 additions; file path not shown in this view):

# -*- coding=utf8 -*-
"""
Constant definitions.
"""
from zhihu.settings import USER_AGENT


class Gender(object):
    """
    Gender values.
    """
    MALE = 1
    FEMALE = 2


class People(object):
    """
    People types.
    """
    Followee = 1
    Follower = 2


HEADER = {
    'Host': 'www.zhihu.com',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Accept': '*/*',
    'Origin': 'https://www.zhihu.com',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': USER_AGENT,
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
}
zhihu/items.py (50 additions):

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class ZhihuPeopleItem(Item):
    """Zhihu user profile.
    Attributes:
        nickname        display name
        zhihu_id        user id
        location        location
        business        industry
        gender          gender
        employment      employer
        position        job title
        education       education
        agree_count     upvotes received
        thanks_count    thanks received
        followee_count  number of users followed
        follower_count  number of followers
        image_url       avatar image URL
    """
    nickname = Field()
    zhihu_id = Field()
    location = Field()
    business = Field()
    gender = Field()
    employment = Field()
    position = Field()
    education = Field()
    agree_count = Field()
    thanks_count = Field()
    followee_count = Field()
    follower_count = Field()
    image_url = Field()


class ZhihuRelationItem(Item):
    """Zhihu user relations.
    Attributes:
        zhihu_id   Zhihu id
        user_list  list of related users
        user_type  relation type (1 = followee, 2 = follower)
    """
    zhihu_id = Field()
    user_list = Field()
    user_type = Field()
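Scrapy Item subclasses support dict-style access. As a sketch of how a spider callback might populate one (the values here are made up; the actual spider file did not render in this view):

# Hypothetical usage sketch; not part of the commit.
from zhihu.items import ZhihuPeopleItem

item = ZhihuPeopleItem()
item['zhihu_id'] = 'example-id'    # made-up value
item['nickname'] = 'example-user'
item['gender'] = 1                 # Gender.MALE in the constants module above
item['image_url'] = ''             # a falsy URL skips the avatar download in the pipeline
print(dict(item))                  # the pipeline stores items via dict(item)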
zhihu/pipelines.py (96 additions):

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

import requests
from pymongo import MongoClient
from scrapy import log

from zhihu.settings import MONGO_URI, PROJECT_DIR
from zhihu.items import ZhihuPeopleItem, ZhihuRelationItem


class ZhihuPipeline(object):
    """
    Store scraped data in MongoDB and download avatar images.
    """
    def __init__(self, mongo_uri, mongo_db, image_dir):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.image_dir = image_dir
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=MONGO_URI,
            mongo_db='zhihu',
            image_dir=os.path.join(PROJECT_DIR, 'images')
        )

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        if not os.path.exists(self.image_dir):
            os.mkdir(self.image_dir)

    def close_spider(self, spider):
        self.client.close()

    def _download_image(self, image_url):
        """
        Download an avatar image.
        """
        try:
            image_name = os.path.split(image_url)[-1]
            image_path = os.path.join(self.image_dir, image_name)
            image = requests.get(image_url, stream=True)
            with open(image_path, 'wb') as img:
                img.write(image.content)
        except Exception as exc:
            # log.ERROR is a level constant, not a callable; the original
            # log.ERROR(exc) would itself raise a TypeError
            log.msg(str(exc), level=log.ERROR)

    def _process_people(self, item):
        """
        Store a user profile (upsert keyed on zhihu_id).
        """
        collection = self.db['people']
        collection.update({'zhihu_id': item['zhihu_id']},
                          dict(item), upsert=True)

        image_url = item['image_url']
        if image_url:
            self._download_image(image_url)

    def _process_relation(self, item):
        """
        Store the follow-relation topology, merging user lists on re-crawl.
        """
        collection = self.db['relation']

        data = collection.find_one({
            'zhihu_id': item['zhihu_id'],
            'user_type': item['user_type']})
        if not data:
            self.db['relation'].insert(dict(item))
        else:
            origin_list = data['user_list']
            new_list = item['user_list']
            data['user_list'] = list(set(origin_list) | set(new_list))
            collection.update({'zhihu_id': item['zhihu_id'],
                               'user_type': item['user_type']}, data)

    def process_item(self, item, spider):
        """
        Dispatch each item to the matching handler.
        """
        if isinstance(item, ZhihuPeopleItem):
            self._process_people(item)
        elif isinstance(item, ZhihuRelationItem):
            self._process_relation(item)
        return item
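A minimal sketch for inspecting what the pipeline wrote, assuming a local MongoDB at the default MONGO_URI and the hard-coded 'zhihu' database (not part of the commit):

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
db = client['zhihu']

print(db['people'].count())                        # stored profiles (pymongo 3.x API, as pinned)
for rel in db['relation'].find({'user_type': 2}):  # 2 = Follower in the constants module
    print(rel['zhihu_id'], len(rel['user_list']))  # followers merged so far for this user
client.close()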
zhihu/settings.py (106 additions):

# -*- coding: utf-8 -*-

# Scrapy settings for zhihu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
import os

BOT_NAME = 'zhihu'

SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) ' \
             'AppleWebKit/537.36 (KHTML, like Gecko) ' \
             'Chrome/49.0.2623.87 Safari/537.36'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Cookies (enabled by default); kept enabled here
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Connection': 'keep-alive'
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zhihu.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zhihu.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'zhihu.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'

# Breadth-first crawl: a positive DEPTH_PRIORITY combined with FIFO queues
# makes the scheduler process shallower requests first
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

# Project path (resolves to the parent of the current working directory,
# so it depends on where the crawl is launched from)
PROJECT_DIR = os.path.dirname(os.path.abspath(os.path.curdir))

# MongoDB configuration
MONGO_URI = 'mongodb://localhost:27017'

# Item pipeline registration
ITEM_PIPELINES = {
    'zhihu.pipelines.ZhihuPipeline': 500,
}
zhihu/spiders/__init__.py (4 additions):

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
The eleventh file, presumably the spider implementation itself, failed to load in this view.