forked from Cl0udG0d/SZhe_Scan
Commit
Showing 4 changed files with 72 additions and 11 deletions.
@@ -0,0 +1,59 @@
import requests
from lxml import etree
from fake_useragent import UserAgent
from urllib.parse import urljoin

ua = UserAgent()

'''
Each level deeper multiplies the number of links, so the cutoff depth is
fixed at 2 for now; it can be raised once multithreading is added.
Crawling stops when the depth reaches 2 or no new links remain in the queue.
Returns the list of links.
References:
https://www.hss5.com/2018/11/28/python%E7%88%AC%E5%8F%96%E7%BD%91%E7%AB%99%E5%85%A8%E9%83%A8url%E9%93%BE%E6%8E%A5/
https://ask.hellobi.com/blog/bixtcexs/11983
https://lskreno.vip/2019/09/15/%E7%88%AC%E8%99%AB%E4%B9%8B%E6%88%98/
https://github.com/sml2h3/python_collect_domain/blob/master/collect.py
'''
def spider(url):
    headers = {'User-Agent': ua.random}
    new_url_list = []
    try:
        rep = requests.get(url, headers=headers, timeout=1.5)
        html = etree.HTML(rep.text)
        url_list = html.xpath('//*[@href]/@href')

        for i in url_list:
            if i.startswith("http"):
                new_url_list.append(i)
            else:
                # Resolve relative hrefs against the page URL; naive string
                # concatenation mangles root-relative paths such as "/about".
                new_url_list.append(urljoin(url, i))
    except requests.RequestException:
        # Unreachable hosts and timeouts are skipped silently.
        pass

    return new_url_list
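
The switch from string concatenation to urljoin above matters for root-relative hrefs; a small illustration of the difference, using only the standard library (the URLs are examples, not taken from the commit):

from urllib.parse import urljoin

base = "https://example.com/blog/post"
print(urljoin(base, "/about"))  # https://example.com/about
print(base + "/about")          # https://example.com/blog/post/about (wrong page)
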
'''
Traverse URLs breadth-first, level by level, using three lists: all_lists
stores every URL collected so far, new_lists stores the new URLs found at
the current level, and old_lists stores the previous level's URLs, which
seed the next level's traversal.
'''
def depth_get(url):
    count = 0
    all_lists = []
    new_lists = [url]
    while count < 2:
        count += 1
        print("Level %d " % count + 20 * "=")
        old_lists = new_lists
        new_lists = []
        for node in old_lists:
            new_lists += spider(node)
        all_lists += new_lists
    # Deduplicate once the traversal is done.
    all_lists = list(set(all_lists))
    # for i in all_lists:
    #     print(i)
    return all_lists

# Test data
# depth_get("https://ask.hellobi.com/blog/bixtcexs/11983")
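
The docstring above defers higher depths until multithreading is added; below is a minimal sketch of that idea using the standard-library concurrent.futures, reusing the spider() defined earlier (depth_get_threaded, max_depth, and workers are hypothetical names, not part of this commit):

from concurrent.futures import ThreadPoolExecutor

def depth_get_threaded(url, max_depth=2, workers=8):
    # Same level-by-level traversal as depth_get, but each level's pages
    # are fetched concurrently, so max_depth can be raised past 2.
    all_lists = []
    new_lists = [url]
    for level in range(1, max_depth + 1):
        print("Level %d " % level + 20 * "=")
        with ThreadPoolExecutor(max_workers=workers) as pool:
            results = pool.map(spider, new_lists)
        new_lists = [link for links in results for link in links]
        all_lists += new_lists
    return list(set(all_lists))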