
Commit

Add article scraping for Juejin and SegmentFault
ylfeng250 committed Jan 28, 2018
1 parent c6b70e3 commit 2ac8f9a
Showing 3 changed files with 85 additions and 0 deletions.
40 changes: 40 additions & 0 deletions 项目/4.博客/juejin.py
@@ -0,0 +1,40 @@
import requests
import re
import html2text
from bs4 import BeautifulSoup
import random
import os


def juejinDownload(url):
    useragents = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ]
    headers = {
        'Host': 'juejin.im',
        'Referer': 'https://juejin.im/',
        'User-Agent': random.choice(useragents)
    }
    res = requests.get(url=url, headers=headers).text  # fetch the full HTML page
    h = html2text.HTML2Text()
    h.ignore_links = False
    soup = BeautifulSoup(res, 'lxml')
    title = soup.find('title').text
    print(title)
    html = soup.find(class_='post-content-container')  # article body container
    print(html)
    # extract the article body and convert it to Markdown
    article = h.handle(str(html))
    pwd = os.getcwd()  # current working directory
    dirpath = pwd + '/juejin/'
    if not os.path.exists(dirpath):  # create the output directory if it does not exist
        os.makedirs(dirpath)
    with open(dirpath + title + '.html', 'w', encoding='utf8') as f:
        f.write(str(html))  # save the extracted HTML
    with open(dirpath + title + '.md', 'w', encoding='utf8') as f:
        f.write(article)  # save the Markdown file


if __name__ == "__main__":
    url = "https://juejin.im/post/5a68437b6fb9a01ca47aabc6"  # test case
    juejinDownload(url)
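A minimal way to try the scraper, assuming it is run from the same directory as juejin.py with requests, bs4, html2text, and lxml installed:

```
# Hedged usage sketch; output lands in ./juejin/<page title>.html and .md
from juejin import juejinDownload

juejinDownload("https://juejin.im/post/5a68437b6fb9a01ca47aabc6")
```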
4 changes: 4 additions & 0 deletions 项目/4.博客/readme.md
@@ -13,6 +13,10 @@

* jianshu.py Jianshu blog scraper

* juejin.py Juejin article scraper

* segmentfault.py SegmentFault article scraper

## Usage examples

41 changes: 41 additions & 0 deletions 项目/4.博客/segmentfault.py
@@ -0,0 +1,41 @@
import requests
import re
import random
import html2text
import os
from bs4 import BeautifulSoup

def segmentfaultDownload(url):
    useragents = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ]
    headers = {
        # 'Host': 'https://segmentfault.com',
        'Referer': 'https://segmentfault.com/',
        'User-Agent': random.choice(useragents)
    }
    h = html2text.HTML2Text()
    h.ignore_links = False

    res = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(res, 'html5lib')
    title = soup.find('title').text  # page title

    html = soup.find(class_='article__content')  # article body container
    # extract the article body and convert it to Markdown
    article = h.handle(str(html))
    pwd = os.getcwd()  # current working directory
    dirpath = pwd + '/segmentfault/'
    if not os.path.exists(dirpath):  # create the output directory if it does not exist
        os.makedirs(dirpath)
    with open(dirpath + title + '.html', 'w', encoding='utf8') as f:
        f.write(str(html))  # save the extracted HTML
    with open(dirpath + title + '.md', 'w', encoding='utf8') as f:
        f.write(article)  # save the Markdown file


if __name__ == "__main__":
    url = "https://segmentfault.com/a/1190000011929414"  # test URL
    segmentfaultDownload(url)
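Both scrapers follow the same fetch, parse, convert, and save pattern and differ only in the request headers, the HTML parser, and the CSS class of the article container. A minimal sketch of a shared helper, not part of this commit; the name download_article and its parameters are hypothetical:

```
import os
import random

import html2text
import requests
from bs4 import BeautifulSoup

USERAGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
]


def download_article(url, referer, container_class, outdir, parser='lxml'):
    # fetch the page with a randomly chosen User-Agent
    headers = {'Referer': referer, 'User-Agent': random.choice(USERAGENTS)}
    res = requests.get(url, headers=headers).text
    soup = BeautifulSoup(res, parser)
    title = soup.find('title').text
    body = soup.find(class_=container_class)
    # convert the extracted container to Markdown
    h = html2text.HTML2Text()
    h.ignore_links = False
    article = h.handle(str(body))
    os.makedirs(outdir, exist_ok=True)
    with open(os.path.join(outdir, title + '.html'), 'w', encoding='utf8') as f:
        f.write(str(body))
    with open(os.path.join(outdir, title + '.md'), 'w', encoding='utf8') as f:
        f.write(article)


# Usage matching the two scripts above:
# download_article(url, 'https://juejin.im/', 'post-content-container', './juejin/')
# download_article(url, 'https://segmentfault.com/', 'article__content', './segmentfault/', parser='html5lib')
```

Keeping the container class and the parser as parameters is what lets one routine cover both sites.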
