Skip to content

Commit

Permalink
4-11章
Browse files Browse the repository at this point in the history
  • Loading branch information
qiyeboy committed Apr 19, 2017
1 parent cf57220 commit 38f6bbd
Show file tree
Hide file tree
Showing 89 changed files with 16,449 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,5 @@ ENV/
# Rope project settings
.ropeproject

./.idea/
.idea/
*.pyc
1 change: 1 addition & 0 deletions .idea/SpiderBook.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions ch04/4.1.2.2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
  "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  <title>Firefox测试</title>
</head>
<body>
  <script type="text/javascript">
    // Demo: write two strings into the page, then emit their
    // concatenation at every console severity level.
    var prefix = "Python";
    var suffix = "爬虫开发";
    document.write(prefix, suffix); // render the text in the page body
    console.log(prefix + suffix);
    console.debug(prefix + suffix);
    console.error(prefix + suffix);
    console.info(prefix + suffix);
    console.warn(prefix + suffix);
  </script>
</body>
</html>
31 changes: 31 additions & 0 deletions ch04/4.1.2.5.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta charset="utf-8">
  <script type="text/javascript">
    // Append the current field contents to #message once per array
    // index when the login button is clicked.
    function doLogin() {
      var out = document.getElementById('message');
      var user = document.getElementById('username');
      var pass = document.getElementById('password');
      arrs = [1, 2, 3, 4, 5, 6, 7, 8, 9]; // no `var`: becomes a global, as in the original
      // for-in over an array yields the index strings "0".."8"
      for (var idx in arrs) {
        out.innerHTML += idx + "<br/>";
        out.innerHTML += "username->" + user.value + "password->" + pass.value + "<br/>";
      }
    }
  </script>
</head>
<body>
  <div>
    <input id="username" type="text" placeholder="用户名" value=""/>
    <br/>
    <input id="password" type="text" placeholder="密码" value=""/>
    <br/>
    <input type="button" value="login" onClick="doLogin();"/>
    <br/>
    <div id="message"></div>
  </div>
</body>
</html>
15 changes: 15 additions & 0 deletions ch04/4.2.2.1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#coding:utf-8
"""Demonstrate re.match: it only matches at the start of the string.

Converted to Python 3 print() calls; the original Python 2 print
statements are syntax errors on Python 3.
"""
import re

# Compile the pattern once and reuse it for both match attempts.
pattern = re.compile(r'\d+')

# match() succeeds only when the digits appear at position 0.
result1 = pattern.match('192abc')
if result1:
    print(result1.group())
else:
    print('匹配失败1')

# Digits are present but not at the start, so this returns None.
result2 = pattern.match('abc192')
if result2:
    print(result2.group())
else:
    print('匹配失败2')
10 changes: 10 additions & 0 deletions ch04/4.2.2.2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#coding:utf-8
"""Demonstrate re.search: it scans the whole string for a match.

Converted to Python 3 print() calls; the original Python 2 print
statements are syntax errors on Python 3.
"""
import re

# Compile the pattern once and reuse it below.
pattern = re.compile(r'\d+')

# Unlike match(), search() finds the digits anywhere in the string.
result1 = pattern.search('abc192edf')
if result1:
    print(result1.group())
else:
    print('匹配失败1')
84 changes: 84 additions & 0 deletions ch04/4.2.2.3_7.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#coding:utf-8
'''
re.split(pattern, string[, maxsplit])
import re
pattern = re.compile(r'\d+')
print re.split(pattern,'A1B2C3D4')
'''

'''
re. findall (pattern, string[, flags])
import re
pattern = re.compile(r'\d+')
print re.findall(pattern,'A1B2C3D4')
'''

'''
re. finditer (pattern, string[, flags])
import re
pattern = re.compile(r'\d+')
matchiter = re.finditer(pattern,'A1B2C3D4')
for match in matchiter:
print match.group()
'''

'''
re. sub(pattern, repl, string[, count])
import re
p = re.compile(r'(?P<word1>\w+) (?P<word2>\w+)')#使用名称引用
s = 'i say, hello world!'
print p.sub(r'\g<word2> \g<word1>', s)
p = re.compile(r'(\w+) (\w+)')#使用编号
print p.sub(r'\2 \1', s)
def func(m):
return m.group(1).title() + ' ' + m.group(2).title()
print p.sub(func, s)
'''

'''
re. subn(pattern, repl, string[, count])
import re
s = 'i say, hello world!'
p = re.compile(r'(\w+) (\w+)')
print p.subn(r'\2 \1', s)
def func(m):
return m.group(1).title() + ' ' + m.group(2).title()
print p.subn(func, s)
'''

'''
import re
pattern = re.compile(r'(\w+) (\w+) (?P<word>.*)')
match = pattern.match( 'I love you!')
print "match.string:", match.string
print "match.re:", match.re
print "match.pos:", match.pos
print "match.endpos:", match.endpos
print "match.lastindex:", match.lastindex
print "match.lastgroup:", match.lastgroup
print "match.group(1,2):", match.group(1, 2)
print "match.groups():", match.groups()
print "match.groupdict():", match.groupdict()
print "match.start(2):", match.start(2)
print "match.end(2):", match.end(2)
print "match.span(2):", match.span(2)
print r"match.expand(r'\2 \1 \3'):", match.expand(r'\2 \1 \3')
'''
156 changes: 156 additions & 0 deletions ch04/4.3.2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#coding:utf-8
"""Guided tour of the BeautifulSoup 4 API: tag access, tree navigation,
find/find_all searching, and CSS selectors.

Requires the third-party bs4 and lxml packages. Converted to Python 3
(the original Python 2 print statements fail to parse on Python 3) and
fixed two navigation bugs noted inline.
"""
import re

import bs4
from bs4 import BeautifulSoup

html_str = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2"><!-- Lacie --></a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# html_str is already a unicode str, so from_encoding is unnecessary
# (bs4 ignores it for str input and emits a warning).
soup = BeautifulSoup(html_str, 'lxml')
print(soup.prettify())

# --- Tag names ---
print(soup.name)        # the whole document
print(soup.title.name)

soup.title.name = 'mytitle'  # renaming a tag rewrites the tree
print(soup.title)            # None now: there is no <title> tag any more
print(soup.mytitle)
# Fixed: after the rename soup.title is None, so the original
# `soup.title.name = 'title'` raised AttributeError.
soup.mytitle.name = 'title'

# --- Tag attributes ---
print(soup.p['class'])
print(soup.p.get('class'))

print(soup.p.attrs)
soup.p['class'] = "myClass"
print(soup.p)

# --- Strings and comments ---
print(soup.p.string)
print(type(soup.p.string))

print(type(soup.name))
print(soup.name)
print(soup.attrs)

print(soup.a.string)
print(type(soup.a.string))

# The first <a> contains only an HTML comment, so .string is a Comment.
if isinstance(soup.a.string, bs4.element.Comment):
    print(soup.a.string)

# --- Children and descendants ---
print(soup.head.contents)
print(len(soup.head.contents))
print(soup.head.contents[0].string)
for child in soup.head.children:
    print(child)
for child in soup.head.descendants:
    print(child)

# .string is None when a tag has more than one child.
print(soup.head.string)
print(soup.title.string)
print(soup.html.string)

for string in soup.strings:
    print(repr(string))

for string in soup.stripped_strings:
    print(repr(string))

# --- Parents ---
print(soup.title)
print(soup.title.parent)

print(soup.a)
for parent in soup.a.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)

# --- Siblings ---
print(soup.p.next_sibling)
# Fixed: the attribute is previous_sibling; `prev_sibling` is not a bs4
# attribute, so the original resolved it as a child-tag lookup (None).
print(soup.p.previous_sibling)
print(soup.p.next_sibling.next_sibling)

for sibling in soup.a.next_siblings:
    print(repr(sibling))

# --- Document-order traversal ---
print(soup.head)
print(soup.head.next_element)

for element in soup.a.next_elements:
    print(repr(element))

# --- Searching with find_all ---
print(soup.find_all('b'))

# Match tag names against a regular expression.
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

print(soup.find_all(["a", "b"]))

# True matches every tag in the document.
for tag in soup.find_all(True):
    print(tag.name)


def hasClass_Id(tag):
    # Matcher: keep tags carrying both a class and an id attribute.
    return tag.has_attr('class') and tag.has_attr('id')


print(soup.find_all(hasClass_Id))

print(soup.find_all(id='link2'))

print(soup.find_all(href=re.compile("elsie")))

print(soup.find_all(id=True))
print(soup.find_all("a", class_="sister"))

print(soup.find_all(href=re.compile("elsie"), id='link1'))

# data-* attributes cannot be keyword arguments; use attrs= instead.
# An explicit parser avoids bs4's "no parser specified" warning.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'lxml')
data_soup.find_all(attrs={"data-foo": "value"})

print(soup.find_all(text="Elsie"))
print(soup.find_all(text=["Tillie", "Elsie", "Lacie"]))
print(soup.find_all(text=re.compile("Dormouse")))

print(soup.find_all("a", text="Elsie"))

print(soup.find_all("a", limit=2))

print(soup.find_all("title"))
print(soup.find_all("title", recursive=False))

# --- CSS selectors ---
# find <title> directly
print(soup.select("title"))
# walk down level by level
print(soup.select("html head title"))
# direct child: <title> under <head>
print(soup.select("head > title"))
# element with id="link1" that is a direct child of a <p>
print(soup.select("p > #link1"))
# all .sister siblings after #link1
print(soup.select("#link1 ~ .sister"))
# the .sister sibling immediately after #link1
print(soup.select("#link1 + .sister"))

print(soup.select(".sister"))
print(soup.select("[class~=sister]"))

print(soup.select("#link1"))
print(soup.select("a#link2"))

print(soup.select('a[href]'))

# Attribute-value selectors: exact, prefix, suffix, substring.
print(soup.select('a[href="http://example.com/elsie"]'))
print(soup.select('a[href^="http://example.com/"]'))
print(soup.select('a[href$="tillie"]'))
print(soup.select('a[href*=".com/el"]'))
31 changes: 31 additions & 0 deletions ch04/4.3.3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#coding:utf-8
# Reference snippets for lxml.etree, kept disabled inside triple-quoted
# string literals; uncomment one block at a time to run it.

# Parse an HTML fragment from a string: etree.HTML repairs the markup
# (adds missing html/body tags) and tostring serialises the result.
'''
from lxml import etree
html_str = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
html = etree.HTML(html_str)
result = etree.tostring(html)
print(result)
'''

# Parse an HTML file from disk and pretty-print the serialised tree.
'''
from lxml import etree
html = etree.parse('index.html')
result = etree.tostring(html, pretty_print=True)
print(result)
'''
21 changes: 21 additions & 0 deletions ch05/5.1.1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#coding:utf-8
"""Scrape chapter titles and links from seputu.com into qiye.json.

Requires the third-party requests and bs4 packages. Converted to
Python 3: the original opened the output file in binary mode ('wb'),
which makes json.dump raise TypeError on Python 3 because it writes str.
"""
import json

from bs4 import BeautifulSoup
import requests

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
r = requests.get('http://seputu.com/', headers=headers)
# r.text is already a unicode str, so from_encoding is unnecessary
# (bs4 ignores it for str input and emits a warning).
soup = BeautifulSoup(r.text, 'html.parser')
content = []
for mulu in soup.find_all(class_="mulu"):
    h2 = mulu.find('h2')
    if h2 is not None:
        h2_title = h2.string  # section title
        chapters = []  # renamed from `list`, which shadowed the builtin
        # Every <a> inside the .box div carries one chapter URL and title.
        for a in mulu.find(class_='box').find_all('a'):
            href = a.get('href')
            box_title = a.get('title')
            chapters.append({'href': href, 'box_title': box_title})
        content.append({'title': h2_title, 'content': chapters})
# Text-mode UTF-8 output; ensure_ascii=False keeps the Chinese readable.
with open('qiye.json', 'w', encoding='utf-8') as fp:
    json.dump(content, fp, indent=4, ensure_ascii=False)
Loading

0 comments on commit 38f6bbd

Please sign in to comment.