Skip to content

Commit

Permalink
修改readme
Browse files Browse the repository at this point in the history
  • Loading branch information
ResolveWang committed Feb 28, 2017
1 parent cda73b6 commit 1430f3c
Show file tree
Hide file tree
Showing 56 changed files with 90 additions and 34 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
*.js linguist-language=Python
*.css linguist-language=Python
*.html linguist-language=Python
*.xml
*.pyc
Empty file modified ReadMe.md
100644 → 100755
Empty file.
3 changes: 2 additions & 1 deletion config/get_config.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def get_redis_args():

def get_weibo_args():
acounts_info = cf.get('weibo_account')
return random.choice(acounts_info).get('account')
#return random.choice(acounts_info).get('account')
return dict(name='18708103033', password='pmaixq3344')

if __name__ == '__main__':
print(get_weibo_args())
23 changes: 13 additions & 10 deletions config/spider.yaml
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
db:
host: 1.23.4.5
host: 202.115.44.140
port: 1521
user: dbs
password: 12345678
db_name: adb
user: dangban
password: 85418825
db_name: ntci
db_type: oracle

redis_db:
Expand All @@ -14,11 +14,14 @@ redis_db:
# 如果微博账号比较多,也可以考虑使用数据库进行读取和修改(比如标记账号的有效性)
weibo_account:
- account:
name: acount1
password: password1
name: [email protected]
password: rookiefly
- account:
name: acount2
password: password2
name: 15708437303
password: rookiefly
- account:
name: acount3
password: password3
name: [email protected]
password: rookiefly
- account:
name: [email protected]
password: rookiefly
Empty file modified config/sql/spider.sql
100644 → 100755
Empty file.
Empty file modified db_operation/__init__.py
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion db_operation/db_connect.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
def get_con():
args = get_db_args()
dsn = cx_Oracle.makedsn(args['host'], args['port'], args['db_name'])
conn = cx_Oracle.connect(args['user'], args['password'], dsn)
conn = cx_Oracle.connect(args['user'], str(args['password']), dsn)
return conn


Expand Down
Empty file modified db_operation/login_info_dao.py
100644 → 100755
Empty file.
Empty file modified db_operation/spread_original_dao.py
100644 → 100755
Empty file.
Empty file modified db_operation/spread_other_dao.py
100644 → 100755
Empty file.
Empty file modified db_operation/user_dao.py
100644 → 100755
Empty file.
Empty file modified db_operation/weibosearch_dao.py
100644 → 100755
Empty file.
Empty file modified do_dataget/__init__.py
100644 → 100755
Empty file.
Empty file modified do_dataget/basic.py
100644 → 100755
Empty file.
Empty file modified do_dataget/get_statusinfo.py
100644 → 100755
Empty file.
Empty file modified do_dataget/get_userinfo.py
100644 → 100755
Empty file.
Empty file modified do_dataprocess/basic.py
100644 → 100755
Empty file.
Empty file modified do_dataprocess/do_searchprocess/__init__.py
100644 → 100755
Empty file.
Empty file modified do_dataprocess/do_searchprocess/search_parse.py
100644 → 100755
Empty file.
Empty file modified do_dataprocess/do_statusprocess/__init__.py
100644 → 100755
Empty file.
Empty file modified do_dataprocess/do_statusprocess/status_parse.py
100644 → 100755
Empty file.
Empty file modified do_dataprocess/get_userprocess/__init__.py
100644 → 100755
Empty file.
Empty file modified do_dataprocess/get_userprocess/get_enterpriseinfo.py
100644 → 100755
Empty file.
Empty file modified do_dataprocess/get_userprocess/get_personalinfo.py
100644 → 100755
Empty file.
Empty file modified do_dataprocess/get_userprocess/get_publicinfo.py
100644 → 100755
Empty file.
Empty file modified do_login/__init__.py
100644 → 100755
Empty file.
15 changes: 5 additions & 10 deletions do_login/login_info.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# -*-coding:utf-8 -*-
# 获取扩散信息
import requests, re, json, os, logging
import requests, re, json, os
import execjs, gl
from config.get_config import get_weibo_args
from logger.log import other


def get_runntime(path):
Expand Down Expand Up @@ -63,9 +64,6 @@ def get_redirect(data, post_url, session):

# 获取成功登陆返回的信息,包括用户id等重要信息,返回登陆session
def get_session():
log_path = os.path.join(os.getcwd(), 'login.log')
logging.basicConfig(filename=log_path, level=logging.INFO, format='[%(asctime)s %(levelname)s] %(message)s',
datefmt='%Y%m%d %H:%M:%S')
session = requests.session()
js_path = os.path.join(os.getcwd(), 'do_login/sinalogin.js')
runntime = get_runntime(js_path)
Expand Down Expand Up @@ -112,16 +110,13 @@ def get_session():
u_pattern = r'"uniqueid":"(.*)",'
m = re.search(u_pattern, login_info)
if m.group(1):
print("你当前使用的是rookiefly实现的微博登陆方式,你的微博id为" + m.group(1))
logging.info('本次登陆账号为:{name}'.format(name=get_weibo_args()['name']))
other.info('本次登陆账号为:{name}'.format(name=get_weibo_args()['name']))
return {'session': session, 'cookie': dict(last_cookies)}
else:
print('登陆失败')
logging.info('本次账号{name}登陆失败'.format(name=get_weibo_args()['name']))
other.info('本次账号{name}登陆失败'.format(name=get_weibo_args()['name']))
return None
else:
print('本次登陆失败')
logging.info('本次账号{name}登陆失败'.format(name=get_weibo_args()['name']))
other.info('本次账号{name}登陆失败'.format(name=get_weibo_args()['name']))
return None


Expand Down
Empty file modified do_login/sinalogin.js
100644 → 100755
Empty file.
Empty file modified get_cookie.py
100644 → 100755
Empty file.
Empty file modified gl.py
100644 → 100755
Empty file.
Empty file modified img/diyu.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified img/kuosan.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified img/reposttime.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified img/sex.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file added logger/__init__.py
Empty file.
59 changes: 59 additions & 0 deletions logger/log.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Central logging setup for the project.

Importing this module configures the standard ``logging`` package through
``dictConfig`` and exposes four named loggers: ``crawler``, ``parser``,
``other`` and ``storage``.
"""
import logging
import logging.config as log_conf
import os

log_config = {
    # dictConfig only accepts schema version 1; any other value raises
    # ValueError("Unsupported version: ...") when the module is imported.
    'version': 1,
    'formatters': {
        'detail': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            'datefmt': "%Y-%m-%d %H:%M:%S",
        },
        'simple': {
            'format': '%(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'level': 'INFO',
            'formatter': 'detail',
        },
        'file': {
            'class': 'logging.handlers.WatchedFileHandler',
            # Relative path: resolved against the process working directory.
            'filename': 'log/log.txt',
            'level': 'INFO',
            'formatter': 'detail',
            'encoding': 'utf-8',
        },
    },
    'loggers': {
        'crawler': {
            'handlers': ['console', 'file'],
            'level': 'DEBUG',
        },
        'parser': {
            'handlers': ['file'],
            'level': 'INFO',
        },
        'other': {
            'handlers': ['console', 'file'],
            'level': 'INFO',
        },
        'storage': {
            'handlers': ['file'],
            'level': 'INFO',
        },
    },
}

# The file handler opens its target as soon as it is constructed, so the
# directory has to exist before dictConfig runs.
_log_dir = os.path.dirname(log_config['handlers']['file']['filename'])
if _log_dir:
    os.makedirs(_log_dir, exist_ok=True)

log_conf.dictConfig(log_config)

other = logging.getLogger('other')
crawler = logging.getLogger('crawler')
# Must match the 'parser' key under 'loggers' above; a different name would
# return a logger that has none of the configured handlers attached.
parser = logging.getLogger('parser')
storage = logging.getLogger('storage')


__all__ = ['crawler', 'parser', 'other', 'storage']


10 changes: 5 additions & 5 deletions repost_run.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from time import sleep, ctime
from get_cookie import get_session
from task.get_repost import get_all
from utils.util_mtd import display_count
from logger.log import other


if __name__ == '__main__':
Expand All @@ -13,19 +13,19 @@
pw = Process(target=get_session, args=(d,))
pw.daemon = True
pr = Process(target=get_all, args=(d,))
print('本轮抓取开始,开始时间为{endtime}'.format(endtime=ctime()))
other.info('本轮抓取开始,开始时间为{endtime}'.format(endtime=ctime()))

pw.start()
# 防止pr先执行
sleep(90)
sleep(60)
pr.start()
pr.join()

pw.terminate()
print('本轮抓取已经结束,结束时间为{endtime}'.format(endtime=ctime()))
other.info('本轮抓取已经结束,结束时间为{endtime}'.format(endtime=ctime()))
pw.join() # 使其可以更新状态

sleep(60*60)
sleep(2*60*60)



Empty file modified search_run.py
100644 → 100755
Empty file.
Empty file modified task/__init__.py
100644 → 100755
Empty file.
10 changes: 3 additions & 7 deletions task/get_repost.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from do_dataget import get_statusinfo
from do_dataget import get_userinfo
from db_operation import spread_other_dao, weibosearch_dao
from logger.log import other


def _get_reposts(url, session, weibo_mid):
Expand Down Expand Up @@ -224,19 +225,14 @@ def _get_current_reposts(url, session, weibo_mid):


def get_all(d):
log_path = os.path.join(os.getcwd(), 'getdata.log')
logging.basicConfig(filename=log_path, level=logging.INFO, format='[%(asctime)s %(levelname)s] %(message)s',
datefmt='%Y%m%d %H:%M:%S')
datas = weibosearch_dao.get_crawl_urls()
print('一共获取到{len}条需要抓取的微博'.format(len=len(datas)))
logging.info('一共获取到{len}条需要抓取的微博'.format(len=len(datas)))
other.info('一共获取到{len}条需要抓取的微博'.format(len=len(datas)))
for data in datas:
# session放在里面是为了防止某个抓取队列太长或者转发微博太多
session = d['session']
logging.info('正在抓取url为{url}的微博'.format(url=data['url']))
_get_current_reposts(data['url'], session, data['mid'])

# 以下代码是为了测试反爬虫机制注释掉的
# 这样处理会导致某些微博给漏掉
weibosearch_dao.update_weibo_url(data['mid'])

logging.info('本次启动一共抓取了{count}个页面'.format(count=count))
Empty file modified task/get_searchinfo.py
100644 → 100755
Empty file.
Empty file modified task/get_userinfo.py
100644 → 100755
Empty file.
Empty file modified test/__init__.py
100644 → 100755
Empty file.
Empty file modified test/test.py
100644 → 100755
Empty file.
Empty file modified utils/__init__.py
100644 → 100755
Empty file.
Empty file modified utils/util_cls.py
100644 → 100755
Empty file.
Empty file modified utils/util_mtd.py
100644 → 100755
Empty file.
Empty file modified weibo_decorator/__init__.py
100644 → 100755
Empty file.
Empty file modified weibo_decorator/decorators.py
100644 → 100755
Empty file.
Empty file modified weibo_entities/__init__.py
100644 → 100755
Empty file.
Empty file modified weibo_entities/other_and_cache.py
100644 → 100755
Empty file.
Empty file modified weibo_entities/spread_original.py
100644 → 100755
Empty file.
Empty file modified weibo_entities/spread_other.py
100644 → 100755
Empty file.
Empty file modified weibo_entities/spread_other_cache.py
100644 → 100755
Empty file.
Empty file modified weibo_entities/user.py
100644 → 100755
Empty file.
Empty file modified weibo_entities/weibo_search_data.py
100644 → 100755
Empty file.

0 comments on commit 1430f3c

Please sign in to comment.