Commit
'Merge conflict'
卜俊杰 committed Mar 9, 2020
2 parents 88c43b8 + 225f54a commit 39c9f41
Showing 6 changed files with 200 additions and 7 deletions.
41 changes: 34 additions & 7 deletions OthertCrawler/0x08fofa/Fofa_spider.py
@@ -20,13 +20,17 @@


 class Fofa():
-    def __init__(self):
+    def __init__(self, config):
+        self.WRITE_MODE = config[
+            'write_mode']  # output formats, as a list; may include any of the five types txt, csv, json, mongo, and mysql
+
+        self.FOFA_USERNAME = config['fofa_username']  # fofa account username
+        self.FOFA_PASSWORD = config['fofa_password']  # fofa account password
+        self.PAGE = config['page']
 
         self.MONGO_URL = 'localhost'
         self.MONGO_DB = 'fofa'
         self.MONGO_TABLE = 'message'
-        self.FOFA_USERNAME = 'xxxxx'
-        self.FOFA_PASSWORD = 'xxxx'
-        self.PAGE = 1000
 
         self._init_db()
         self._init_browser()
@@ -135,7 +139,30 @@ def main(self, q):
         self.browser.quit()
 
 
+import os
+import sys
+import json
+
+
+def main():
+    try:
+        config_path = os.path.split(
+            os.path.realpath(__file__))[0] + os.sep + 'config.json'
+        if not os.path.isfile(config_path):
+            sys.exit(u'Config file config.json does not exist in current path: %s' %
+                     (os.path.split(os.path.realpath(__file__))[0] + os.sep))
+        with open(config_path) as f:
+            try:
+                config = json.loads(f.read())
+            except ValueError:
+                sys.exit(u'config.json is not valid JSON; please refer to ')
+        fofa = Fofa(config)
+        fofa.start()  # crawl fofa results
+    except Exception as e:
+        print('Error: ', e)
+
+
 if __name__ == '__main__':
-    fa = Fofa()
-    q = '"博彩" && country=CN'
-    fa.main(q)
+    main()
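For reference, a minimal sketch of how the refactored entry point is driven; the keys mirror config.json below, and Fofa.start() is assumed to iterate the configured queries (its body is outside this diff):

    # Hypothetical usage; the values here are placeholders, not the author's defaults.
    config = {
        'write_mode': ['txt'],
        'fofa_username': 'user@example.com',
        'fofa_password': 'secret',
        'page': 10,
        'query': ['redis'],
    }
    fofa = Fofa(config)
    fofa.start()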
19 changes: 19 additions & 0 deletions OthertCrawler/0x08fofa/config.json
@@ -0,0 +1,19 @@
{
    "query": ["redis"],
    "fofa_username": "xxxxx",
    "fofa_password": "xxxxx",
    "page": 100,
    "write_mode": ["csv", "excel", "mysql", "txt", "mongodb"],
    "mysql_config": {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "root",
        "password": "123456",
        "charset": "utf8mb4"
    },
    "mongodb_config": {
        "mongo_url": "127.0.0.1",
        "mongo_db": "fofa",
        "mongo_table": "message"
    }
}
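The spider's _init_db presumably consumes the two nested blocks; a minimal sketch, assuming pymongo and pymysql (neither wiring is shown in this diff), of how they map onto connections:

    # Assumption: this mirrors what _init_db does; pymongo and pymysql required.
    import json
    import pymongo
    import pymysql

    with open('config.json') as f:
        config = json.load(f)

    mongo_cfg = config['mongodb_config']
    collection = pymongo.MongoClient(mongo_cfg['mongo_url'])[
        mongo_cfg['mongo_db']][mongo_cfg['mongo_table']]

    mysql_cfg = config['mysql_config']
    conn = pymysql.connect(**mysql_cfg)  # host, port, user, password, charset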
24 changes: 24 additions & 0 deletions TouTiao/README.md
@@ -0,0 +1,24 @@
# About

## About this program

- A Toutiao (今日头条) crawler, written in a hurry; the code is not pretty, but it works.

- It includes a proxy section that you can modify yourself; a sketch of its shape follows.
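
For reference, this is the shape of the tunnel-proxy mapping that `toutiao.py` in this commit uses (the host, port, and credentials below are placeholders, not working values):

```python
# Placeholder kdlapi tunnel credentials; substitute your own.
tunnel_host = "tpsXXX.kdlapi.com"
tunnel_port = "15818"
tid = "tXXXXXXXXXXXX"
password = "xxxxxxxx"

proxies = {
    "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
    "https": "https://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
}
```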

## About the author

I work in `big data` and `data analysis`; fellow practitioners are welcome to reach out~

- github : [https://github.com/SoliDeoGloria31](https://github.com/SoliDeoGloria31)

- Gitee : [https://gitee.com/joseph31](https://gitee.com/joseph31)

- WeChat : mortaltiger

<img src="https://gitee.com/joseph31/picture_bed/raw/master/mortaltiger.jpg" width="15%">

- Personal WeChat official account: JosephNest (Joseph's corner)
  I test new features often, so the backing server can be unstable and may break. It implements an `automatic recommendation system`, `auto-reply`, `feature-request message board`, `AI integration (image recognition)`, and `other custom features`

<img src="https://gitee.com/joseph31/picture_bed/raw/master/JosephNest.jpg" width="15%">
Binary file added TouTiao/pictures/JosephNest.jpg
Binary file added TouTiao/pictures/mortaltiger.jpg
123 changes: 123 additions & 0 deletions TouTiao/toutiao.py
@@ -0,0 +1,123 @@
import csv
import random
import re
import time
from urllib.parse import quote

import pandas as pd
import requests
from fake_useragent import UserAgent
from lxml import etree
from requests.exceptions import ConnectionError
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

base_url = 'https://www.toutiao.com/api/search/content/'
timestamp = int(time.time() * 1000)

ua = UserAgent(verify_ssl=False)
article_url_list = []
csv_name = pd.read_csv("typhoon_toutiao.csv")  # previously collected results; not referenced again below

page_urls = ["http://dev.kdlapi.com/testproxy",
             "https://dev.kdlapi.com/testproxy",
             ]  # kdlapi test endpoints (unused below)

# Tunnel proxy server
tunnel_host = "tps189.kdlapi.com"
tunnel_port = "15818"

# Tunnel proxy username and password
tid = "t17888082960619"
password = "gid72p4o"

proxies = {
    "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
    "https": "https://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
}

# Avoid duplicates
constract_list = []

# Collect every article URL from the search result pages


def get_article_urls(name):
    decde = quote(name)
    referer = 'https://www.toutiao.com/search/?keyword=' + decde
    for offset in range(0, 120, 20):  # page through the results, 20 per request; sometimes there are fewer pages
        params = {
            'aid': 24,
            'app_name': 'web_search',
            'offset': offset,
            'format': 'json',
            'keyword': name,
            'autoload': 'true',
            'count': 20,
            'en_qc': 1,
            'cur_tab': 1,
            'from': 'search_tab',
            'pd': 'synthesis',
            'timestamp': timestamp
        }
        headers = {
            'cookie': 'tt_webid=6781305717874820616; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6781305717874820616; s_v_web_id=59cfa658a89df645e8a82f1618a81bd0; __tasessionId=g8ptymp5v1579144106433',
            'user-agent': ua.random,
            'x-requested-with': 'XMLHttpRequest',
            'referer': referer,
        }
        html = requests.get(url=base_url, params=params,
                            headers=headers, proxies=proxies)
        result = list(html.json().get('data'))
        for item in result:
            article_url = item.get('article_url')  # extract each article's URL
            if article_url and len(article_url) < 100 and (".mp4" not in article_url) and "toutiao.com" in article_url:
                if '/group/' in article_url:
                    article_url = article_url.replace(
                        '/group/', '/a').replace('http://', 'https://www.')
                article_url_list.append(article_url)
                print(article_url)
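
# constract_list above is declared to "avoid duplicates" but is never used in
# this commit. A sketch (an assumption, not the author's code) of the dedup
# check it presumably intended, at the point where URLs are appended:
#     if article_url not in constract_list:
#         constract_list.append(article_url)
#         article_url_list.append(article_url)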


def request_AND_storage(name):
    filename = name + ".csv"
    try:
        get_article_urls(name)
    except Exception as e:
        print(e)

    browser = webdriver.Chrome()

    time.sleep(2)
    for url in article_url_list:
        print(url)
        try:
            browser.get(url)
            time.sleep(1)
            text_res = browser.find_element_by_xpath(
                '//div[@class="article-box"]')
            print(text_res)
            text_res = text_res.text
            print(text_res)
            # newline='' keeps csv.writer from emitting blank rows on Windows
            with open(filename, 'a', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                L = [name, text_res]
                writer.writerow(L)
        except:
            continue

    browser.close()


if __name__ == '__main__':
    try:
        request_AND_storage('武汉疫情')
        article_url_list = []
        time.sleep(10)
    except Exception as e:
        print(e)
        article_url_list = []
        time.sleep(1)
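The entry point reads as if it were lifted from a loop over several search keywords; a minimal sketch of that pattern (the keyword list here is invented for illustration):

    # Sketch only; the keyword list is an illustrative assumption.
    if __name__ == '__main__':
        for keyword in ['武汉疫情', '台风']:
            try:
                request_AND_storage(keyword)
                article_url_list = []
                time.sleep(10)
            except Exception as e:
                print(e)
                article_url_list = []
                time.sleep(1)
                continue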
