Skip to content

Commit

Permalink
4-11章
Browse files Browse the repository at this point in the history
  • Loading branch information
qiyeboy committed Apr 19, 2017
1 parent cf57220 commit 38f6bbd
Show file tree
Hide file tree
Showing 89 changed files with 16,449 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,5 @@ ENV/
# Rope project settings
.ropeproject

./.idea/
.idea/
*.pyc
1 change: 1 addition & 0 deletions .idea/SpiderBook.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions ch04/4.1.2.2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
  "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  <title>Firefox测试</title>
</head>
<body>
  <script type="text/javascript">
    // Demo: write two strings into the page, then emit their
    // concatenation at every console severity level.
    var prefix = "Python";
    var suffix = "爬虫开发";
    document.write(prefix, suffix); // render the text in the page body
    console.log(prefix + suffix);
    console.debug(prefix + suffix);
    console.error(prefix + suffix);
    console.info(prefix + suffix);
    console.warn(prefix + suffix);
  </script>
</body>
</html>
31 changes: 31 additions & 0 deletions ch04/4.1.2.5.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta charset="utf-8">
  <script type="text/javascript">
    // Append the current field contents to #message once per array
    // index when the login button is clicked.
    function doLogin() {
      var out = document.getElementById('message');
      var user = document.getElementById('username');
      var pass = document.getElementById('password');
      arrs = [1, 2, 3, 4, 5, 6, 7, 8, 9]; // no `var`: becomes a global, as in the original
      // for-in over an array yields the index strings "0".."8"
      for (var idx in arrs) {
        out.innerHTML += idx + "<br/>";
        out.innerHTML += "username->" + user.value + "password->" + pass.value + "<br/>";
      }
    }
  </script>
</head>
<body>
  <div>
    <input id="username" type="text" placeholder="用户名" value=""/>
    <br/>
    <input id="password" type="text" placeholder="密码" value=""/>
    <br/>
    <input type="button" value="login" onClick="doLogin();"/>
    <br/>
    <div id="message"></div>
  </div>
</body>
</html>
15 changes: 15 additions & 0 deletions ch04/4.2.2.1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#coding:utf-8
"""Demonstrate re.match: it only matches at the start of the string.

Converted to Python 3 print() calls; the original Python 2 print
statements are syntax errors on Python 3.
"""
import re

# Compile the pattern once and reuse it for both match attempts.
pattern = re.compile(r'\d+')

# match() succeeds only when the digits appear at position 0.
result1 = pattern.match('192abc')
if result1:
    print(result1.group())
else:
    print('匹配失败1')

# Digits are present but not at the start, so this returns None.
result2 = pattern.match('abc192')
if result2:
    print(result2.group())
else:
    print('匹配失败2')
10 changes: 10 additions & 0 deletions ch04/4.2.2.2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#coding:utf-8
"""Demonstrate re.search: it scans the whole string for a match.

Converted to Python 3 print() calls; the original Python 2 print
statements are syntax errors on Python 3.
"""
import re

# Compile the pattern once and reuse it below.
pattern = re.compile(r'\d+')

# Unlike match(), search() finds the digits anywhere in the string.
result1 = pattern.search('abc192edf')
if result1:
    print(result1.group())
else:
    print('匹配失败1')
84 changes: 84 additions & 0 deletions ch04/4.2.2.3_7.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#coding:utf-8
'''
re.split(pattern, string[, maxsplit])
import re
pattern = re.compile(r'\d+')
print re.split(pattern,'A1B2C3D4')
'''

'''
re. findall (pattern, string[, flags])
import re
pattern = re.compile(r'\d+')
print re.findall(pattern,'A1B2C3D4')
'''

'''
re. finditer (pattern, string[, flags])
import re
pattern = re.compile(r'\d+')
matchiter = re.finditer(pattern,'A1B2C3D4')
for match in matchiter:
print match.group()
'''

'''
re. sub(pattern, repl, string[, count])
import re
p = re.compile(r'(?P<word1>\w+) (?P<word2>\w+)')#使用名称引用
s = 'i say, hello world!'
print p.sub(r'\g<word2> \g<word1>', s)
p = re.compile(r'(\w+) (\w+)')#使用编号
print p.sub(r'\2 \1', s)
def func(m):
return m.group(1).title() + ' ' + m.group(2).title()
print p.sub(func, s)
'''

'''
re. subn(pattern, repl, string[, count])
import re
s = 'i say, hello world!'
p = re.compile(r'(\w+) (\w+)')
print p.subn(r'\2 \1', s)
def func(m):
return m.group(1).title() + ' ' + m.group(2).title()
print p.subn(func, s)
'''

'''
import re
pattern = re.compile(r'(\w+) (\w+) (?P<word>.*)')
match = pattern.match( 'I love you!')
print "match.string:", match.string
print "match.re:", match.re
print "match.pos:", match.pos
print "match.endpos:", match.endpos
print "match.lastindex:", match.lastindex
print "match.lastgroup:", match.lastgroup
print "match.group(1,2):", match.group(1, 2)
print "match.groups():", match.groups()
print "match.groupdict():", match.groupdict()
print "match.start(2):", match.start(2)
print "match.end(2):", match.end(2)
print "match.span(2):", match.span(2)
print r"match.expand(r'\2 \1 \3'):", match.expand(r'\2 \1 \3')
'''
156 changes: 156 additions & 0 deletions ch04/4.3.2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#coding:utf-8
"""Guided tour of the BeautifulSoup 4 API: tag access, tree navigation,
find/find_all searching, and CSS selectors.

Requires the third-party bs4 and lxml packages. Converted to Python 3
(the original Python 2 print statements fail to parse on Python 3) and
fixed two navigation bugs noted inline.
"""
import re

import bs4
from bs4 import BeautifulSoup

html_str = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2"><!-- Lacie --></a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# html_str is already a unicode str, so from_encoding is unnecessary
# (bs4 ignores it for str input and emits a warning).
soup = BeautifulSoup(html_str, 'lxml')
print(soup.prettify())

# --- Tag names ---
print(soup.name)        # the whole document
print(soup.title.name)

soup.title.name = 'mytitle'  # renaming a tag rewrites the tree
print(soup.title)            # None now: there is no <title> tag any more
print(soup.mytitle)
# Fixed: after the rename soup.title is None, so the original
# `soup.title.name = 'title'` raised AttributeError.
soup.mytitle.name = 'title'

# --- Tag attributes ---
print(soup.p['class'])
print(soup.p.get('class'))

print(soup.p.attrs)
soup.p['class'] = "myClass"
print(soup.p)

# --- Strings and comments ---
print(soup.p.string)
print(type(soup.p.string))

print(type(soup.name))
print(soup.name)
print(soup.attrs)

print(soup.a.string)
print(type(soup.a.string))

# The first <a> contains only an HTML comment, so .string is a Comment.
if isinstance(soup.a.string, bs4.element.Comment):
    print(soup.a.string)

# --- Children and descendants ---
print(soup.head.contents)
print(len(soup.head.contents))
print(soup.head.contents[0].string)
for child in soup.head.children:
    print(child)
for child in soup.head.descendants:
    print(child)

# .string is None when a tag has more than one child.
print(soup.head.string)
print(soup.title.string)
print(soup.html.string)

for string in soup.strings:
    print(repr(string))

for string in soup.stripped_strings:
    print(repr(string))

# --- Parents ---
print(soup.title)
print(soup.title.parent)

print(soup.a)
for parent in soup.a.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)

# --- Siblings ---
print(soup.p.next_sibling)
# Fixed: the attribute is previous_sibling; `prev_sibling` is not a bs4
# attribute, so the original resolved it as a child-tag lookup (None).
print(soup.p.previous_sibling)
print(soup.p.next_sibling.next_sibling)

for sibling in soup.a.next_siblings:
    print(repr(sibling))

# --- Document-order traversal ---
print(soup.head)
print(soup.head.next_element)

for element in soup.a.next_elements:
    print(repr(element))

# --- Searching with find_all ---
print(soup.find_all('b'))

# Match tag names against a regular expression.
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

print(soup.find_all(["a", "b"]))

# True matches every tag in the document.
for tag in soup.find_all(True):
    print(tag.name)


def hasClass_Id(tag):
    # Matcher: keep tags carrying both a class and an id attribute.
    return tag.has_attr('class') and tag.has_attr('id')


print(soup.find_all(hasClass_Id))

print(soup.find_all(id='link2'))

print(soup.find_all(href=re.compile("elsie")))

print(soup.find_all(id=True))
print(soup.find_all("a", class_="sister"))

print(soup.find_all(href=re.compile("elsie"), id='link1'))

# data-* attributes cannot be keyword arguments; use attrs= instead.
# An explicit parser avoids bs4's "no parser specified" warning.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'lxml')
data_soup.find_all(attrs={"data-foo": "value"})

print(soup.find_all(text="Elsie"))
print(soup.find_all(text=["Tillie", "Elsie", "Lacie"]))
print(soup.find_all(text=re.compile("Dormouse")))

print(soup.find_all("a", text="Elsie"))

print(soup.find_all("a", limit=2))

print(soup.find_all("title"))
print(soup.find_all("title", recursive=False))

# --- CSS selectors ---
# find <title> directly
print(soup.select("title"))
# walk down level by level
print(soup.select("html head title"))
# direct child: <title> under <head>
print(soup.select("head > title"))
# element with id="link1" that is a direct child of a <p>
print(soup.select("p > #link1"))
# all .sister siblings after #link1
print(soup.select("#link1 ~ .sister"))
# the .sister sibling immediately after #link1
print(soup.select("#link1 + .sister"))

print(soup.select(".sister"))
print(soup.select("[class~=sister]"))

print(soup.select("#link1"))
print(soup.select("a#link2"))

print(soup.select('a[href]'))

# Attribute-value selectors: exact, prefix, suffix, substring.
print(soup.select('a[href="http://example.com/elsie"]'))
print(soup.select('a[href^="http://example.com/"]'))
print(soup.select('a[href$="tillie"]'))
print(soup.select('a[href*=".com/el"]'))
31 changes: 31 additions & 0 deletions ch04/4.3.3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#coding:utf-8
# Reference snippets for lxml.etree, kept disabled inside triple-quoted
# string literals; uncomment one block at a time to run it.

# Parse an HTML fragment from a string: etree.HTML repairs the markup
# (adds missing html/body tags) and tostring serialises the result.
'''
from lxml import etree
html_str = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
html = etree.HTML(html_str)
result = etree.tostring(html)
print(result)
'''

# Parse an HTML file from disk and pretty-print the serialised tree.
'''
from lxml import etree
html = etree.parse('index.html')
result = etree.tostring(html, pretty_print=True)
print(result)
'''
21 changes: 21 additions & 0 deletions ch05/5.1.1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#coding:utf-8
"""Scrape chapter titles and links from seputu.com into qiye.json.

Requires the third-party requests and bs4 packages. Converted to
Python 3: the original opened the output file in binary mode ('wb'),
which makes json.dump raise TypeError on Python 3 because it writes str.
"""
import json

from bs4 import BeautifulSoup
import requests

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
r = requests.get('http://seputu.com/', headers=headers)
# r.text is already a unicode str, so from_encoding is unnecessary
# (bs4 ignores it for str input and emits a warning).
soup = BeautifulSoup(r.text, 'html.parser')
content = []
for mulu in soup.find_all(class_="mulu"):
    h2 = mulu.find('h2')
    if h2 is not None:
        h2_title = h2.string  # section title
        chapters = []  # renamed from `list`, which shadowed the builtin
        # Every <a> inside the .box div carries one chapter URL and title.
        for a in mulu.find(class_='box').find_all('a'):
            href = a.get('href')
            box_title = a.get('title')
            chapters.append({'href': href, 'box_title': box_title})
        content.append({'title': h2_title, 'content': chapters})
# Text-mode UTF-8 output; ensure_ascii=False keeps the Chinese readable.
with open('qiye.json', 'w', encoding='utf-8') as fp:
    json.dump(content, fp, indent=4, ensure_ascii=False)
Loading

0 comments on commit 38f6bbd

Please sign in to comment.