Commit
'Merge conflict'
卜俊杰 committed Mar 9, 2020
2 parents 88c43b8 + 225f54a commit 39c9f41
Showing 6 changed files with 200 additions and 7 deletions.
41 changes: 34 additions & 7 deletions OthertCrawler/0x08fofa/Fofa_spider.py
@@ -20,13 +20,17 @@


 class Fofa():
-    def __init__(self):
+    def __init__(self, config):
+        self.WRITE_MODE = config[
+            'write_mode']  # output formats, as a list; may include any of the five types txt, csv, json, mongo, and mysql
+
+        self.FOFA_USERNAME = config['fofa_username']  # fofa account username
+        self.FOFA_PASSWORD = config['fofa_password']  # fofa account password
+        self.PAGE = config['page']
 
         self.MONGO_URL = 'localhost'
         self.MONGO_DB = 'fofa'
         self.MONGO_TABLE = 'message'
-        self.FOFA_USERNAME = 'xxxxx'
-        self.FOFA_PASSWORD = 'xxxx'
-        self.PAGE = 1000
 
         self._init_db()
         self._init_browser()
@@ -135,7 +139,30 @@ def main(self, q):
         self.browser.quit()
 
 
+import os
+import sys
+import json
+
+
+def main():
+    try:
+        config_path = os.path.split(
+            os.path.realpath(__file__))[0] + os.sep + 'config.json'
+        if not os.path.isfile(config_path):
+            sys.exit(u'Config file config.json does not exist in current path: %s' %
+                     (os.path.split(os.path.realpath(__file__))[0] + os.sep))
+        with open(config_path) as f:
+            try:
+                config = json.loads(f.read())
+            except ValueError:
+                sys.exit(u'config.json is not valid JSON; please refer to ')
+        fofa = Fofa(config)
+        fofa.start()  # crawl fofa results
+    except Exception as e:
+        print('Error: ', e)
+
+
 if __name__ == '__main__':
-    fa = Fofa()
-    q = '"博彩" && country=CN'
-    fa.main(q)
+    main()
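For reference, a minimal sketch of how the refactored entry point is driven; the keys mirror config.json below, and Fofa.start() is assumed to iterate the configured queries (its body is outside this diff):

    # Hypothetical usage; the values here are placeholders, not the author's defaults.
    config = {
        'write_mode': ['txt'],
        'fofa_username': 'user@example.com',
        'fofa_password': 'secret',
        'page': 10,
        'query': ['redis'],
    }
    fofa = Fofa(config)
    fofa.start()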
19 changes: 19 additions & 0 deletions OthertCrawler/0x08fofa/config.json
@@ -0,0 +1,19 @@
{
    "query": ["redis"],
    "fofa_username": "xxxxx",
    "fofa_password": "xxxxx",
    "page": 100,
    "write_mode": ["csv", "excel", "mysql", "txt", "mongodb"],
    "mysql_config": {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "root",
        "password": "123456",
        "charset": "utf8mb4"
    },
    "mongodb_config": {
        "mongo_url": "127.0.0.1",
        "mongo_db": "fofa",
        "mongo_table": "message"
    }
}
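The spider's _init_db presumably consumes the two nested blocks; a minimal sketch, assuming pymongo and pymysql (neither wiring is shown in this diff), of how they map onto connections:

    # Assumption: this mirrors what _init_db does; pymongo and pymysql required.
    import json
    import pymongo
    import pymysql

    with open('config.json') as f:
        config = json.load(f)

    mongo_cfg = config['mongodb_config']
    collection = pymongo.MongoClient(mongo_cfg['mongo_url'])[
        mongo_cfg['mongo_db']][mongo_cfg['mongo_table']]

    mysql_cfg = config['mysql_config']
    conn = pymysql.connect(**mysql_cfg)  # host, port, user, password, charset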
24 changes: 24 additions & 0 deletions TouTiao/README.md
@@ -0,0 +1,24 @@
# About

## About this program

- A Toutiao (今日头条) crawler, written in a hurry; the code is not pretty, but it works.

- It includes a proxy section that you can modify yourself; a sketch of its shape follows.
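
For reference, this is the shape of the tunnel-proxy mapping that `toutiao.py` in this commit uses (the host, port, and credentials below are placeholders, not working values):

```python
# Placeholder kdlapi tunnel credentials; substitute your own.
tunnel_host = "tpsXXX.kdlapi.com"
tunnel_port = "15818"
tid = "tXXXXXXXXXXXX"
password = "xxxxxxxx"

proxies = {
    "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
    "https": "https://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
}
```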

## About the author

I work in `big data` and `data analysis`; fellow practitioners are welcome to reach out~

- github : [https://github.com/SoliDeoGloria31](https://github.com/SoliDeoGloria31)

- Gitee : [https://gitee.com/joseph31](https://gitee.com/joseph31)

- WeChat : mortaltiger

<img src="https://gitee.com/joseph31/picture_bed/raw/master/mortaltiger.jpg" width="15%">

- Personal WeChat official account: JosephNest (Joseph's corner)
  I test new features often, so the backing server can be unstable and may break. It implements an `automatic recommendation system`, `auto-reply`, `feature-request message board`, `AI integration (image recognition)`, and `other custom features`

<img src="https://gitee.com/joseph31/picture_bed/raw/master/JosephNest.jpg" width="15%">
Binary file added TouTiao/pictures/JosephNest.jpg
Binary file added TouTiao/pictures/mortaltiger.jpg
123 changes: 123 additions & 0 deletions TouTiao/toutiao.py
@@ -0,0 +1,123 @@
import csv
import random
import re
import time
from urllib.parse import quote

import pandas as pd
import requests
from fake_useragent import UserAgent
from lxml import etree
from requests.exceptions import ConnectionError
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

base_url = 'https://www.toutiao.com/api/search/content/'
timestamp = int(time.time() * 1000)

ua = UserAgent(verify_ssl=False)
article_url_list = []
csv_name = pd.read_csv("typhoon_toutiao.csv")  # previously collected results; not referenced again below

page_urls = ["http://dev.kdlapi.com/testproxy",
             "https://dev.kdlapi.com/testproxy",
             ]  # kdlapi test endpoints (unused below)

# Tunnel proxy server
tunnel_host = "tps189.kdlapi.com"
tunnel_port = "15818"

# Tunnel proxy username and password
tid = "t17888082960619"
password = "gid72p4o"

proxies = {
    "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
    "https": "https://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
}

# Avoid duplicates
constract_list = []

# Collect every article URL from the search result pages


def get_article_urls(name):
    decde = quote(name)
    referer = 'https://www.toutiao.com/search/?keyword=' + decde
    for offset in range(0, 120, 20):  # page through the results, 20 per request; sometimes there are fewer pages
        params = {
            'aid': 24,
            'app_name': 'web_search',
            'offset': offset,
            'format': 'json',
            'keyword': name,
            'autoload': 'true',
            'count': 20,
            'en_qc': 1,
            'cur_tab': 1,
            'from': 'search_tab',
            'pd': 'synthesis',
            'timestamp': timestamp
        }
        headers = {
            'cookie': 'tt_webid=6781305717874820616; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6781305717874820616; s_v_web_id=59cfa658a89df645e8a82f1618a81bd0; __tasessionId=g8ptymp5v1579144106433',
            'user-agent': ua.random,
            'x-requested-with': 'XMLHttpRequest',
            'referer': referer,
        }
        html = requests.get(url=base_url, params=params,
                            headers=headers, proxies=proxies)
        result = list(html.json().get('data'))
        for item in result:
            article_url = item.get('article_url')  # extract each article's URL
            if article_url and len(article_url) < 100 and (".mp4" not in article_url) and "toutiao.com" in article_url:
                if '/group/' in article_url:
                    article_url = article_url.replace(
                        '/group/', '/a').replace('http://', 'https://www.')
                article_url_list.append(article_url)
                print(article_url)
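
# constract_list above is declared to "avoid duplicates" but is never used in
# this commit. A sketch (an assumption, not the author's code) of the dedup
# check it presumably intended, at the point where URLs are appended:
#     if article_url not in constract_list:
#         constract_list.append(article_url)
#         article_url_list.append(article_url)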


def request_AND_storage(name):
    filename = name + ".csv"
    try:
        get_article_urls(name)
    except Exception as e:
        print(e)

    browser = webdriver.Chrome()

    time.sleep(2)
    for url in article_url_list:
        print(url)
        try:
            browser.get(url)
            time.sleep(1)
            text_res = browser.find_element_by_xpath(
                '//div[@class="article-box"]')
            print(text_res)
            text_res = text_res.text
            print(text_res)
            # newline='' keeps csv.writer from emitting blank rows on Windows
            with open(filename, 'a', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                L = [name, text_res]
                writer.writerow(L)
        except:
            continue

    browser.close()


if __name__ == '__main__':
    try:
        request_AND_storage('武汉疫情')
        article_url_list = []
        time.sleep(10)
    except Exception as e:
        print(e)
        article_url_list = []
        time.sleep(1)
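The entry point reads as if it were lifted from a loop over several search keywords; a minimal sketch of that pattern (the keyword list here is invented for illustration):

    # Sketch only; the keyword list is an illustrative assumption.
    if __name__ == '__main__':
        for keyword in ['武汉疫情', '台风']:
            try:
                request_AND_storage(keyword)
                article_url_list = []
                time.sleep(10)
            except Exception as e:
                print(e)
                article_url_list = []
                time.sleep(1)
                continue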
