
Commit

Add article scraping for Juejin and SegmentFault
ylfeng250 committed Jan 28, 2018
1 parent c6b70e3 commit 2ac8f9a
Showing 3 changed files with 85 additions and 0 deletions.
40 changes: 40 additions & 0 deletions 项目/4.博客/juejin.py
@@ -0,0 +1,40 @@
import requests
import re
import html2text
from bs4 import BeautifulSoup
import random
import os


def juejinDownload(url):
    useragents = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ]
    headers = {
        'Host': 'juejin.im',
        'Referer': 'https://juejin.im/',
        'User-Agent': random.choice(useragents)
    }
    res = requests.get(url=url, headers=headers).text  # fetch the full HTML page
    h = html2text.HTML2Text()
    h.ignore_links = False
    soup = BeautifulSoup(res, 'lxml')
    title = soup.find('title').text
    print(title)
    html = soup.find(class_='post-content-container')  # article body container
    print(html)
    # extract the article body and convert it to Markdown
    article = h.handle(str(html))
    pwd = os.getcwd()  # current working directory
    dirpath = pwd + '/juejin/'
    if not os.path.exists(dirpath):  # create the output directory if it does not exist
        os.makedirs(dirpath)
    with open(dirpath + title + '.html', 'w', encoding='utf8') as f:
        f.write(str(html))  # save the extracted HTML
    with open(dirpath + title + '.md', 'w', encoding='utf8') as f:
        f.write(article)  # save the Markdown file


if __name__ == "__main__":
    url = "https://juejin.im/post/5a68437b6fb9a01ca47aabc6"  # test case
    juejinDownload(url)
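A minimal way to try the scraper, assuming it is run from the same directory as juejin.py with requests, bs4, html2text, and lxml installed:

```
# Hedged usage sketch; output lands in ./juejin/<page title>.html and .md
from juejin import juejinDownload

juejinDownload("https://juejin.im/post/5a68437b6fb9a01ca47aabc6")
```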
4 changes: 4 additions & 0 deletions 项目/4.博客/readme.md
@@ -13,6 +13,10 @@

* jianshu.py Jianshu blog scraper

* juejin.py Juejin article scraper

* segmentfault.py SegmentFault article scraper

## Usage examples

41 changes: 41 additions & 0 deletions 项目/4.博客/segmentfault.py
@@ -0,0 +1,41 @@
import requests
import re
import random
import html2text
import os
from bs4 import BeautifulSoup

def segmentfaultDownload(url):
    useragents = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ]
    headers = {
        # 'Host': 'https://segmentfault.com',
        'Referer': 'https://segmentfault.com/',
        'User-Agent': random.choice(useragents)
    }
    h = html2text.HTML2Text()
    h.ignore_links = False

    res = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(res, 'html5lib')
    title = soup.find('title').text  # page title

    html = soup.find(class_='article__content')  # article body container
    # extract the article body and convert it to Markdown
    article = h.handle(str(html))
    pwd = os.getcwd()  # current working directory
    dirpath = pwd + '/segmentfault/'
    if not os.path.exists(dirpath):  # create the output directory if it does not exist
        os.makedirs(dirpath)
    with open(dirpath + title + '.html', 'w', encoding='utf8') as f:
        f.write(str(html))  # save the extracted HTML
    with open(dirpath + title + '.md', 'w', encoding='utf8') as f:
        f.write(article)  # save the Markdown file


if __name__ == "__main__":
    url = "https://segmentfault.com/a/1190000011929414"  # test URL
    segmentfaultDownload(url)
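Both scrapers follow the same fetch, parse, convert, and save pattern and differ only in the request headers, the HTML parser, and the CSS class of the article container. A minimal sketch of a shared helper, not part of this commit; the name download_article and its parameters are hypothetical:

```
import os
import random

import html2text
import requests
from bs4 import BeautifulSoup

USERAGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
]


def download_article(url, referer, container_class, outdir, parser='lxml'):
    # fetch the page with a randomly chosen User-Agent
    headers = {'Referer': referer, 'User-Agent': random.choice(USERAGENTS)}
    res = requests.get(url, headers=headers).text
    soup = BeautifulSoup(res, parser)
    title = soup.find('title').text
    body = soup.find(class_=container_class)
    # convert the extracted container to Markdown
    h = html2text.HTML2Text()
    h.ignore_links = False
    article = h.handle(str(body))
    os.makedirs(outdir, exist_ok=True)
    with open(os.path.join(outdir, title + '.html'), 'w', encoding='utf8') as f:
        f.write(str(body))
    with open(os.path.join(outdir, title + '.md'), 'w', encoding='utf8') as f:
        f.write(article)


# Usage matching the two scripts above:
# download_article(url, 'https://juejin.im/', 'post-content-container', './juejin/')
# download_article(url, 'https://segmentfault.com/', 'article__content', './segmentfault/', parser='html5lib')
```

Keeping the container class and the parser as parameters is what lets one routine cover both sites.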
