forked from qiyeboy/SpiderBook
-
Notifications
You must be signed in to change notification settings - Fork 0
/
4.3.2.py
155 lines (114 loc) · 3.59 KB
/
4.3.2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#coding:utf-8
import bs4
from bs4 import BeautifulSoup
html_str = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2"><!-- Lacie --></a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_str,'lxml', from_encoding='utf-8')
print soup.prettify()
print soup.name
print soup.title.name
soup.title.name = 'mytitle'
print soup.title
print soup.mytitle
soup.mytitle.name = 'title'
print soup.p['class']
print soup.p.get('class')
print soup.p.attrs
soup.p['class']="myClass"
print soup.p
print soup.p.string
print type(soup.p.string)
print type(soup.name)
print soup.name
print soup.attrs
print soup.a.string
print type(soup.a.string)
if type(soup.a.string)==bs4.element.Comment:
print soup.a.string
print soup.head.contents
print len(soup.head.contents)
print soup.head.contents[0].string
for child in soup.head.children:
print(child)
for child in soup.head.descendants:
print(child)
print soup.head.string
print soup.title.string
print soup.html.string
for string in soup.strings:
print(repr(string))
for string in soup.stripped_strings:
print(repr(string))
print soup.title
print soup.title.parent
print soup.a
for parent in soup.a.parents:
if parent is None:
print(parent)
else:
print(parent.name)
print soup.p.next_sibling
print soup.p.prev_sibling
print soup.p.next_sibling.next_sibling
for sibling in soup.a.next_siblings:
print(repr(sibling))
print soup.head
print soup.head.next_element
for element in soup.a.next_elements:
print(repr(element))
print soup.find_all('b')
import re
for tag in soup.find_all(re.compile("^b")):
print(tag.name)
print soup.find_all(["a", "b"])
for tag in soup.find_all(True):
print(tag.name)
def hasClass_Id(tag):
    # Filter for find_all: accept only tags carrying both a class and an id.
    return all(tag.has_attr(attr_name) for attr_name in ('class', 'id'))
print soup.find_all(hasClass_Id)
print soup.find_all(id='link2')
print soup.find_all(href=re.compile("elsie"))
print soup.find_all(id=True)
print soup.find_all("a", class_="sister")
print soup.find_all(href=re.compile("elsie"), id='link1')
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')
data_soup.find_all(attrs={"data-foo": "value"})
print soup.find_all(text="Elsie")
print soup.find_all(text=["Tillie", "Elsie", "Lacie"])
print soup.find_all(text=re.compile("Dormouse"))
print soup.find_all("a", text="Elsie")
print soup.find_all("a", limit=2)
print soup.find_all("title")
print soup.find_all("title", recursive=False)
#直接查找title标签
print soup.select("title")
#逐层查找title标签
print soup.select("html head title")
#查找直接子节点
#查找head下的title标签
print soup.select("head > title")
#查找p下的id="link1"的标签
print soup.select("p > #link1")
#查找兄弟节点
#查找id="link1"之后class=sisiter的所有兄弟标签
print soup.select("#link1 ~ .sister")
#查找紧跟着id="link1"之后class=sisiter的子标签
print soup.select("#link1 + .sister")
print soup.select(".sister")
print soup.select("[class~=sister]")
print soup.select("#link1")
print soup.select("a#link2")
print soup.select('a[href]')
print soup.select('a[href="http://example.com/elsie"]')
print soup.select('a[href^="http://example.com/"]')
print soup.select('a[href$="tillie"]')
print soup.select('a[href*=".com/el"]')