forked from qiyeboy/SpiderBook
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
qiyeboy
committed
Apr 19, 2017
1 parent
cf57220
commit 38f6bbd
Showing
89 changed files
with
16,449 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -88,5 +88,5 @@ ENV/ | |
# Rope project settings | ||
.ropeproject | ||
|
||
./.idea/ | ||
.idea/ | ||
*.pyc |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
  "http://www.w3.org/TR/html4/strict.dtd">
<!-- Minimal page exercising document.write and the browser console
     severity levels (log/debug/error/info/warn). -->
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
    <title>Firefox测试</title>
  </head>
  <body>
    <script type="text/javascript">
      var a = "Python";
      var b = "爬虫开发";
      document.write(a, b); // render the text directly into the page
      // Emit the same message once at every console severity level.
      console.log(a + b);
      console.debug(a + b);
      console.error(a + b);
      console.info(a + b);
      console.warn(a + b);
    </script>
  </body>
</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- Demo form: clicking "login" echoes the array indices plus the
     entered username/password into the #message div. -->
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <meta charset="utf-8">
    <script type="text/javascript">
      function doLogin() {
        var msg = document.getElementById('message');
        var username = document.getElementById('username');
        var password = document.getElementById('password');
        arrs = [1, 2, 3, 4, 5, 6, 7, 8, 9];
        // NOTE: for..in iterates the INDICES "0".."8", not the values.
        for (var arr in arrs) {
          msg.innerHTML += arr + "<br/>";
          msg.innerHTML += "username->" + username.value
            + "password->" + password.value + "<br/>";
        }
      }
    </script>
  </head>
  <body>
    <div>
      <input id="username" type="text" placeholder="用户名" value=""/>
      <br/>
      <input id="password" type="text" placeholder="密码" value=""/>
      <br/>
      <input type="button" value="login" onClick="doLogin();"/>
      <br/>
      <div id="message"></div>
    </div>
  </body>
</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#coding:utf-8
# Demo: re.match anchors the pattern at the START of the string.
# Fixed to use the single-argument print(...) form, which behaves
# identically under Python 2 and Python 3 (the original Python 2-only
# `print x` statements are a syntax error on Python 3).
import re

# Compile the regular expression once into a Pattern object.
pattern = re.compile(r'\d+')

# re.match succeeds only when the pattern matches at position 0;
# it returns None otherwise.
result1 = re.match(pattern, '192abc')
if result1:
    # '192abc' starts with digits, so this prints '192'.
    print(result1.group())
else:
    print('匹配失败1')

result2 = re.match(pattern, 'abc192')
if result2:
    print(result2.group())
else:
    # 'abc192' does not start with a digit, so the match fails.
    print('匹配失败2')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#coding:utf-8
# Demo: re.search scans the WHOLE string for the first match (the
# original comment wrongly said re.match — fixed).  Also switched to
# the single-argument print(...) form, which runs identically under
# Python 2 and Python 3.
import re

# Compile the regular expression once into a Pattern object.
pattern = re.compile(r'\d+')

# re.search (unlike re.match) finds the pattern anywhere in the
# string; it returns None when nothing matches.
result1 = re.search(pattern, 'abc192edf')
if result1:
    # The digits appear mid-string, so this prints '192'.
    print(result1.group())
else:
    print('匹配失败1')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#coding:utf-8
# Demo scratchpad for the Python 2 `re` module.  Each triple-quoted
# string below is an inert snippet (a bare string literal, never
# executed); remove the quotes around one block to try it.
# NOTE(review): the snippets use Python 2 print statements and appear
# to have lost their indentation (e.g. the bodies of `func` and the
# `for` loop), so they need touch-up before actually running.

# re.split: split the string at every match of the pattern.
'''
re.split(pattern, string[, maxsplit])
import re
pattern = re.compile(r'\d+')
print re.split(pattern,'A1B2C3D4')
'''

# re.findall: all non-overlapping matches, returned as a list.
'''
re. findall (pattern, string[, flags])
import re
pattern = re.compile(r'\d+')
print re.findall(pattern,'A1B2C3D4')
'''

# re.finditer: like findall, but yields match objects lazily.
'''
re. finditer (pattern, string[, flags])
import re
pattern = re.compile(r'\d+')
matchiter = re.finditer(pattern,'A1B2C3D4')
for match in matchiter:
print match.group()
'''

# re.sub: substitution using group references (by name and by number)
# and with a callable replacement.
'''
re. sub(pattern, repl, string[, count])
import re
p = re.compile(r'(?P<word1>\w+) (?P<word2>\w+)')#使用名称引用
s = 'i say, hello world!'
print p.sub(r'\g<word2> \g<word1>', s)
p = re.compile(r'(\w+) (\w+)')#使用编号
print p.sub(r'\2 \1', s)
def func(m):
return m.group(1).title() + ' ' + m.group(2).title()
print p.sub(func, s)
'''

# re.subn: like sub, but also returns how many substitutions were made.
'''
re. subn(pattern, repl, string[, count])
import re
s = 'i say, hello world!'
p = re.compile(r'(\w+) (\w+)')
print p.subn(r'\2 \1', s)
def func(m):
return m.group(1).title() + ' ' + m.group(2).title()
print p.subn(func, s)
'''

# Match-object API tour: input/pattern attributes (string, re, pos,
# endpos), group access (group, groups, groupdict, lastindex,
# lastgroup), positions (start, end, span), and template expansion.
'''
import re
pattern = re.compile(r'(\w+) (\w+) (?P<word>.*)')
match = pattern.match( 'I love you!')
print "match.string:", match.string
print "match.re:", match.re
print "match.pos:", match.pos
print "match.endpos:", match.endpos
print "match.lastindex:", match.lastindex
print "match.lastgroup:", match.lastgroup
print "match.group(1,2):", match.group(1, 2)
print "match.groups():", match.groups()
print "match.groupdict():", match.groupdict()
print "match.start(2):", match.start(2)
print "match.end(2):", match.end(2)
print "match.span(2):", match.span(2)
print r"match.expand(r'\2 \1 \3'):", match.expand(r'\2 \1 \3')
'''
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
#coding:utf-8
# Walkthrough of the BeautifulSoup (bs4) API on a small HTML document:
# parsing, tag/attribute access, tree navigation (children, parents,
# siblings, elements), the find_all search interface, and CSS
# selectors via select().  All output goes to stdout.
#
# Fixes applied:
# * `soup.p.prev_sibling` -> `soup.p.previous_sibling`: bs4 has no
#   `prev_sibling` attribute, so the original expression fell through
#   to a tag search for a nonexistent <prev_sibling> tag and printed
#   None instead of the actual previous sibling.
# * Python 2-only `print x` statements rewritten as single-argument
#   print(...) calls, which behave identically under Python 2 and 3.
import bs4
from bs4 import BeautifulSoup

html_str = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2"><!-- Lacie --></a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# NOTE(review): from_encoding only matters for byte input; html_str is
# already text here, so bs4 ignores it (with a warning).
soup = BeautifulSoup(html_str, 'lxml', from_encoding='utf-8')
print(soup.prettify())

# Name of the document object itself, then of the <title> tag.
print(soup.name)
print(soup.title.name)

# Tag names are writable: rename <title> and then rename it back.
print(soup.title)
print(soup.mytitle)
soup.title.name = 'title'

# Attribute access: subscription and .get() are equivalent.
print(soup.p['class'])
print(soup.p.get('class'))

print(soup.p.attrs)
soup.p['class'] = "myClass"
print(soup.p)

# .string is the single enclosed NavigableString (or None).
print(soup.p.string)
print(type(soup.p.string))

print(type(soup.name))
print(soup.name)
print(soup.attrs)

# The first <a> contains only an HTML comment, so .string is a
# bs4 Comment node rather than plain text.
print(soup.a.string)
print(type(soup.a.string))

if type(soup.a.string) == bs4.element.Comment:
    print(soup.a.string)

# Direct children vs. all descendants of <head>.
print(soup.head.contents)
print(len(soup.head.contents))
print(soup.head.contents[0].string)
for child in soup.head.children:
    print(child)
for child in soup.head.descendants:
    print(child)

# .string on tags with zero or multiple text children.
print(soup.head.string)
print(soup.title.string)
print(soup.html.string)

# Every text node, then the same with pure-whitespace nodes stripped.
for string in soup.strings:
    print(repr(string))

for string in soup.stripped_strings:
    print(repr(string))

print(soup.title)
print(soup.title.parent)

# Walk up through all ancestors of the first <a>.
print(soup.a)
for parent in soup.a.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)

# Sibling navigation; whitespace text nodes count as siblings too.
print(soup.p.next_sibling)
print(soup.p.previous_sibling)  # BUGFIX: was the nonexistent prev_sibling
print(soup.p.next_sibling.next_sibling)

for sibling in soup.a.next_siblings:
    print(repr(sibling))

# next_element follows document order, not just siblings.
print(soup.head)
print(soup.head.next_element)

for element in soup.a.next_elements:
    print(repr(element))

# find_all with a plain tag name.
print(soup.find_all('b'))

# find_all with a regex matched against tag names.
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

# find_all with a list of tag names.
print(soup.find_all(["a", "b"]))

# find_all(True) yields every tag; a callable acts as a custom filter.
for tag in soup.find_all(True):
    print(tag.name)

def hasClass_Id(tag):
    # Keep only tags carrying both a class and an id attribute.
    return tag.has_attr('class') and tag.has_attr('id')

print(soup.find_all(hasClass_Id))

# Keyword-argument filters match attributes.
print(soup.find_all(id='link2'))

print(soup.find_all(href=re.compile("elsie")))

print(soup.find_all(id=True))
print(soup.find_all("a", class_="sister"))

print(soup.find_all(href=re.compile("elsie"), id='link1'))

# data-* attributes cannot be keyword arguments; use attrs= instead.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(attrs={"data-foo": "value"})

# text= filters match text nodes rather than tags.
print(soup.find_all(text="Elsie"))
print(soup.find_all(text=["Tillie", "Elsie", "Lacie"]))
print(soup.find_all(text=re.compile("Dormouse")))

print(soup.find_all("a", text="Elsie"))

# limit caps the result count; recursive=False searches direct
# children only.
print(soup.find_all("a", limit=2))

print(soup.find_all("title"))
print(soup.find_all("title", recursive=False))

# --- CSS selectors via select() ---
# Find <title> tags directly.
print(soup.select("title"))
# Walk down the tree tag by tag.
print(soup.select("html head title"))
# Direct-child combinator: <title> directly under <head>.
print(soup.select("head > title"))
# The element with id="link1" directly under a <p>.
print(soup.select("p > #link1"))
# All .sister siblings following #link1.
print(soup.select("#link1 ~ .sister"))
# Only the .sister sibling immediately after #link1.
print(soup.select("#link1 + .sister"))

# Class selectors.
print(soup.select(".sister"))
print(soup.select("[class~=sister]"))

# Id selectors, optionally combined with a tag name.
print(soup.select("#link1"))
print(soup.select("a#link2"))

# Attribute presence selector.
print(soup.select('a[href]'))

# Attribute value selectors: exact, prefix, suffix, substring.
print(soup.select('a[href="http://example.com/elsie"]'))
print(soup.select('a[href^="http://example.com/"]'))
print(soup.select('a[href$="tillie"]'))
print(soup.select('a[href*=".com/el"]'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#coding:utf-8
# Demo scratchpad for lxml.etree HTML parsing.  Each triple-quoted
# string below is an inert snippet (a bare string literal, never
# executed); remove the quotes around one block to try it.

# Parse an HTML string with etree.HTML and serialize the resulting
# tree back out with etree.tostring.
'''
from lxml import etree
html_str = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
html = etree.HTML(html_str)
result = etree.tostring(html)
print(result)
'''

# Parse an HTML file from disk with etree.parse and pretty-print it.
'''
from lxml import etree
html = etree.parse('index.html')
result = etree.tostring(html, pretty_print=True)
print(result)
'''
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#coding:utf-8
# Scrape http://seputu.com/ chapter listings and dump them to
# qiye.json.  For every <div class="mulu"> block the <h2> text is the
# section title and each <a> inside its <div class="box"> contributes
# a url/title pair.
#
# Fixes applied:
# * local `list` renamed to `chapters` (shadowed the builtin).
# * `h2 != None` -> `h2 is not None` (identity test for None).
# * qiye.json opened in text mode 'w': json.dump writes str, so a
#   binary ('wb') handle fails on Python 3.
import json
from bs4 import BeautifulSoup
import requests

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
r = requests.get('http://seputu.com/', headers=headers)
soup = BeautifulSoup(r.text, 'html.parser', from_encoding='utf-8')  # html.parser
content = []
for mulu in soup.find_all(class_="mulu"):
    h2 = mulu.find('h2')
    # Skip blocks that have no heading at all.
    if h2 is not None:
        h2_title = h2.string  # section title
        chapters = []
        # Collect url and chapter title from every <a> tag in the box.
        for a in mulu.find(class_='box').find_all('a'):
            href = a.get('href')
            box_title = a.get('title')
            chapters.append({'href': href, 'box_title': box_title})
        content.append({'title': h2_title, 'content': chapters})
with open('qiye.json', 'w') as fp:
    json.dump(content, fp=fp, indent=4)
Oops, something went wrong.