Forked from jackfrued/Python-100-Days
Showing 3 changed files with 137 additions and 0 deletions.
File 1 of 3: new file, 60 lines added. A simple crawler built on urllib that stores article titles in MySQL via pymysql.
import re
from urllib.error import URLError
from urllib.request import urlopen

import pymysql


def get_page_code(start_url, *, retry_times=3, charsets=('utf-8',)):
    """Fetch a page and decode it, trying each charset in turn."""
    html = None
    try:
        for charset in charsets:
            try:
                html = urlopen(start_url).read().decode(charset)
                break
            except UnicodeDecodeError:
                html = None
    except URLError as ex:
        print('Error:', ex)
        # On a network error, retry recursively until the retry budget is spent.
        return get_page_code(start_url, retry_times=retry_times - 1,
                             charsets=charsets) if retry_times > 0 else None
    return html


def main():
    url_list = ['http://sports.sohu.com/nba_a.shtml']
    visited_list = set()
    while len(url_list) > 0:
        current_url = url_list.pop(0)
        visited_list.add(current_url)
        print(current_url)
        html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
        if html:
            # Collect the href of every <a> tag on the current page.
            link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            link_list = re.findall(link_regex, html)
            url_list += link_list
            conn = pymysql.connect(host='localhost', port=3306,
                                   db='crawler', user='root',
                                   passwd='123456', charset='utf8')
            try:
                for link in link_list:
                    if link not in visited_list:
                        visited_list.add(link)
                        print(link)
                        html = get_page_code(link, charsets=('utf-8', 'gbk', 'gb2312'))
                        if html:
                            # On Sohu article pages the headline sits between
                            # <h1> and a following <span>.
                            title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)
                            match_list = title_regex.findall(html)
                            if len(match_list) > 0:
                                title = match_list[0]
                                with conn.cursor() as cursor:
                                    cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)',
                                                   (title, link))
                                conn.commit()
            finally:
                conn.close()
    print('Done!')


if __name__ == '__main__':
    main()
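The insert statement above assumes a tb_result table with rtitle and rurl columns, but the commit does not include the schema. A minimal sketch of a DDL that would satisfy it; the rid key and the column sizes are guesses, not part of the original:

import pymysql

conn = pymysql.connect(host='localhost', port=3306, db='crawler',
                       user='root', passwd='123456', charset='utf8')
try:
    with conn.cursor() as cursor:
        # Hypothetical schema; only the rtitle and rurl columns are
        # actually implied by the crawler's insert statement.
        cursor.execute("""
            create table if not exists tb_result (
                rid int auto_increment primary key,
                rtitle varchar(200) not null,
                rurl varchar(1024) not null
            ) default charset=utf8
        """)
    conn.commit()
finally:
    conn.close()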
File 2 of 3: new file, 50 lines added. A tour of BeautifulSoup's query methods against an in-line HTML document.
import re

from bs4 import BeautifulSoup


def main():
    # A small in-line HTML document used as test data for the parser.
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>首页</title>
    </head>
    <body>
        <h1>Hello, world!</h1>
        <p>Good!!!</p>
        <hr>
        <div>
            <h2>这是一个例子程序</h2>
            <p>静夜思</p>
            <p class="foo">床前明月光</p>
            <p id="bar">疑似地上霜</p>
            <p class="foo">举头望明月</p>
            <div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
        </div>
        <a class="foo" href="http://www.qq.com">腾讯网</a>
        <img src="./img/pretty-girl.png" alt="美女">
        <img src="./img/hellokitty.png" alt="凯蒂猫">
        <img src="./static/img/pretty-girl.png" alt="美女">
        <goup>Hello, Goup!</goup>
    </body>
    </html>
    """
    # resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # html = resp.content.decode('gbk')
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title)
    # Attribute access walks the tree, much like document.body.h1 in JavaScript.
    print(soup.body.h1)
    # find_all accepts a regex: every tag whose name ends with 'p' (p, goup).
    print(soup.find_all(re.compile(r'p$')))
    # Filter <img> tags by a regex on the src attribute.
    print(soup.find_all('img', {'src': re.compile(r'\./img/\w+.png')}))
    # A callable filter: tags that carry exactly two attributes.
    print(soup.find_all(lambda x: len(x.attrs) == 2))
    # Filter <p> tags by class.
    print(soup.find_all('p', {'class': 'foo'}))
    # CSS selector syntax: every <a> that has an href attribute.
    for elem in soup.select('a[href]'):
        print(elem.attrs['href'])


if __name__ == '__main__':
    main()
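Most of the find_all calls above also have CSS-selector equivalents through select. A short sketch of what those would look like against the same soup object; these selector spellings are my own, not from the commit:

print(soup.select('p.foo'))                # <p> tags with class "foo"
print(soup.select_one('p#bar'))            # the single <p> with id "bar"
print(soup.select('div a p'))              # <p> nested under <a> under <div>
print(soup.select('img[src^="./img/"]'))   # src attribute starting with ./img/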
File 3 of 3: new file, 27 lines added. Fetching Sohu pages with requests and extracting headlines with BeautifulSoup.
import re

import requests
from bs4 import BeautifulSoup


def main():
    # Fetch the page with the get function of the third-party requests library.
    resp = requests.get('http://sports.sohu.com/nba_a.shtml')
    # Decode the response bytes (some Sohu pages use GBK encoding).
    html = resp.content.decode('gbk')
    # Build a BeautifulSoup object to parse the page (much like the DOM in JavaScript).
    bs = BeautifulSoup(html, 'lxml')
    # Look up elements with CSS selector syntax and handle them in a loop.
    # for elem in bs.find_all(lambda x: 'test' in x.attrs):
    for elem in bs.select('a[test]'):
        # Read the element's attribute values through the attrs dict.
        link_url = elem.attrs['href']
        resp = requests.get(link_url)
        bs_sub = BeautifulSoup(resp.text, 'lxml')
        # Post-process the extracted text with a regular expression.
        print(re.sub(r'[\r\n]', '', bs_sub.find('h1').text))


if __name__ == '__main__':
    main()
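The loop assumes every matched anchor carries an href and every linked page contains an <h1>; on real pages either can be missing, and bs_sub.find('h1') would then return None and raise an AttributeError. A defensive variant of the loop body, a sketch rather than part of the commit, might look like this:

for elem in bs.select('a[test]'):
    link_url = elem.attrs.get('href')
    if not link_url:
        continue
    try:
        # A timeout keeps one slow link from stalling the whole crawl.
        resp = requests.get(link_url, timeout=10)
    except requests.RequestException as ex:
        print('Error:', ex)
        continue
    h1 = BeautifulSoup(resp.text, 'lxml').find('h1')
    if h1:
        print(re.sub(r'[\r\n]', '', h1.text))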