Initial commit of code from book

PeiGuancheng · Jun 15, 2015 · 4260ca4 · 4260ca4
1 parent e939535
commit 4260ca4
Show file tree

Hide file tree

Showing 98 changed files with 1,629 additions and 0 deletions.
diff --git a/chapter1/.DS_Store b/chapter1/.DS_Store
diff --git a/chapter1/1-basicExample.py b/chapter1/1-basicExample.py
@@ -0,0 +1,3 @@
+from urllib.request import urlopen
+html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
+print(html.read())
diff --git a/chapter1/2-beautifulSoup.py b/chapter1/2-beautifulSoup.py
@@ -0,0 +1,6 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+
+html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
+bsObj = BeautifulSoup(html.read());
+print(bsObj.h1)
diff --git a/chapter1/3-exceptionHandling.py b/chapter1/3-exceptionHandling.py
@@ -0,0 +1,26 @@
+from urllib.request import urlopen
+from urllib.error import HTTPError
+from bs4 import BeautifulSoup
+import sys
+
+
+def getTitle(url):
+    try:
+        html = urlopen(url)
+    except HTTPError as e:
+        print(e)
+        return None
+    try:
+        bsObj = BeautifulSoup(html.read())
+        title = bsObj.body.h1
+    except AttributeError as e:
+        return None
+    return title
+
+title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
+if title == None:
+    print("Title could not be found")
+else:
+    print(title)
+
+
diff --git a/chapter10/1-seleniumBasic.py b/chapter10/1-seleniumBasic.py
@@ -0,0 +1,8 @@
+from selenium import webdriver
+import time
+
+driver = webdriver.PhantomJS(executable_path='')
+driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
+time.sleep(3)
+print(driver.find_element_by_id("content").text)
+driver.close()
diff --git a/chapter10/2-waitForLoad.py b/chapter10/2-waitForLoad.py
@@ -0,0 +1,11 @@
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+driver = webdriver.PhantomJS(executable_path='')
+driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
+try:
+    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))
+finally:
+    print(driver.find_element_by_id("content").text)
+    driver.close()
diff --git a/chapter10/3-javascriptRedirect.py b/chapter10/3-javascriptRedirect.py
@@ -0,0 +1,23 @@
+from selenium import webdriver
+import time
+from selenium.webdriver.remote.webelement import WebElement
+from selenium.common.exceptions import StaleElementReferenceException
+
+def waitForLoad(driver):
+    elem = driver.find_element_by_tag_name("html")
+    count = 0
+    while True:
+        count += 1
+        if count > 20:
+            print("Timing out after 10 seconds and returning")
+            return
+        time.sleep(.5)
+        try:
+            elem == driver.find_element_by_tag_name("html")
+        except StaleElementReferenceException:
+            return
+
+driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
+driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
+waitForLoad(driver)
+print(driver.page_source)
diff --git a/chapter11/1-basicImage.py b/chapter11/1-basicImage.py
@@ -0,0 +1,6 @@
+from PIL import Image, ImageFilter
+
+kitten = Image.open("kitten.jpg")
+blurryKitten = kitten.filter(ImageFilter.GaussianBlur)
+blurryKitten.save("kitten_blurred.jpg")
+blurryKitten.show()
diff --git a/chapter11/2-cleanImage.py b/chapter11/2-cleanImage.py
@@ -0,0 +1,19 @@
+from PIL import Image
+import subprocess
+
+def cleanFile(filePath, newFilePath):
+    image = Image.open(filePath)
+
+    #Set a threshold value for the image, and save
+    image = image.point(lambda x: 0 if x<143 else 255)
+    image.save(newFilePath)
+
+    #call tesseract to do OCR on the newly created image
+    subprocess.call(["tesseract", newFilePath, "output"])
+
+    #Open and read the resulting data file
+    outputFile = open("output.txt", 'r')
+    print(outputFile.read())
+    outputFile.close()
+
+cleanFile("text_2.png", "text_2_clean.png")
diff --git a/chapter11/3-readWebImages.py b/chapter11/3-readWebImages.py
@@ -0,0 +1,36 @@
+import time
+from urllib.request import urlretrieve
+import subprocess
+from selenium import webdriver
+
+driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
+#driver = webdriver.Firefox()
+driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
+time.sleep(2)
+
+driver.find_element_by_id("sitbLogoImg").click()
+#The easiest way to get exactly one of every page
+imageList = set()
+
+#Wait for the page to load
+time.sleep(10)
+print(driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"))
+while "pointer" in driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"):
+    #While we can click on the right arrow, move through the pages
+    driver.find_element_by_id("sitbReaderRightPageTurner").click()
+    time.sleep(2)
+    #Get any new pages that have loaded (multiple pages can load at once)
+    pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img")
+    for page in pages:
+        image = page.get_attribute("src")
+        imageList.add(image)
+
+driver.quit()
+
+#Start processing the images we've collected URLs for with Tesseract
+for image in sorted(imageList):
+    urlretrieve(image, "page.jpg")
+    p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+    p.wait()
+    f = open("page.txt", "r")
+    print(f.read())
diff --git a/chapter11/4-solveCaptcha.py b/chapter11/4-solveCaptcha.py
@@ -0,0 +1,48 @@
+from urllib.request import urlretrieve
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import subprocess
+import requests
+from PIL import Image
+from PIL import ImageOps
+
+def cleanImage(imagePath):
+    image = Image.open(imagePath)
+    image = image.point(lambda x: 0 if x<143 else 255)
+    borderImage = ImageOps.expand(image,border=20,fill='white')
+    borderImage.save(imagePath)
+
+html = urlopen("http://www.pythonscraping.com/humans-only")
+bsObj = BeautifulSoup(html)
+#Gather prepopulated form values
+imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
+formBuildId = bsObj.find("input", {"name":"form_build_id"})["value"]
+captchaSid = bsObj.find("input", {"name":"captcha_sid"})["value"]
+captchaToken = bsObj.find("input", {"name":"captcha_token"})["value"]
+
+captchaUrl = "http://pythonscraping.com"+imageLocation
+urlretrieve(captchaUrl, "captcha.jpg")
+cleanImage("captcha.jpg")
+p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"], stdout=
+    subprocess.PIPE,stderr=subprocess.PIPE)
+p.wait()
+f = open("captcha.txt", "r")
+
+#Clean any whitespace characters
+captchaResponse = f.read().replace(" ", "").replace("\n", "")
+print("Captcha solution attempt: "+captchaResponse)
+
+if len(captchaResponse) == 5:
+    params = {"captcha_token":captchaToken, "captcha_sid":captchaSid,   
+              "form_id":"comment_node_page_form", "form_build_id": formBuildId, 
+                  "captcha_response":captchaResponse, "name":"Ryan Mitchell", 
+                  "subject": "I come to seek the Grail", 
+                  "comment_body[und][0][value]": 
+                                           "...and I am definitely not a bot"}
+    r = requests.post("http://www.pythonscraping.com/comment/reply/10", 
+                          data=params)
+    responseObj = BeautifulSoup(r.text)
+    if responseObj.find("div", {"class":"messages"}) is not None:
+        print(responseObj.find("div", {"class":"messages"}).get_text())
+else:
+    print("There was a problem reading the CAPTCHA correctly!")
diff --git a/chapter11/ghostdriver.log b/chapter11/ghostdriver.log
@@ -0,0 +1,23 @@
+PhantomJS is launching GhostDriver...
+[INFO  - 2015-06-15T00:08:45.592Z] GhostDriver - Main - running on port 51799
+[INFO  - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) PhantomJS/1.9.8 Safari/534.34","webSecurityEnabled":true}
+[INFO  - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.customHeaders:  - {}
+[INFO  - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"1.9.8","driverName":"ghostdriver","driverVersion":"1.1.0","platform":"mac-10.9 (Mavericks)-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
+[INFO  - 2015-06-15T00:08:46.231Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9
+[ERROR - 2015-06-15T00:08:47.864Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - msg: ReferenceError: Can't find variable: ue
+[ERROR - 2015-06-15T00:08:47.864Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - stack:
+  (anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:10202)
+  (anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:940)
+  (anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:807)
+  (anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:696)
+  (anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:784)
+  (anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:959)
+  (anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:161)
+  (anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:70)
+  (anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:960)
+  (anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:2615)
+  (anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:2640)
+  (anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:10203)
+[ERROR - 2015-06-15T00:08:47.865Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - msg: TypeError: 'null' is not an object (evaluating 'old_error_handler.apply')
+[ERROR - 2015-06-15T00:08:47.865Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - stack:
+  dpOnErrorOverride (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:1186)
diff --git a/chapter11/page.jpg b/chapter11/page.jpg
diff --git a/chapter11/page.txt b/chapter11/page.txt
@@ -0,0 +1,47 @@
+WEI‘ nrrd Peace
+Len Nlkelayevldu Iolﬂuy
+
+Readmg shmdd be ax
+wlnvame asnossxble Wenﬂer
+an mm m our cram: Llhvary
+
+— Leo Tmsloy was a Russian rwovelwst
+I and moval phﬂmopher med lur
+A ms Ideas 01 nonviolenx reswslance m 5 We range 0, “and”
+
+M.   
+known for ms genevosxly to the '°'”"“‘ ‘”' "*°’“‘* W‘"'
+
+reading dwnmuie,—a|\
+‘L  °““"“‘ opnmizedfarreadabIh(y—so
+readerscan tho the fnrmat
+
+ms best knawn nuvckave “War and
+
+Peace" (1869),w>v(h msmy regarded as an em ‘”“”L"‘5"‘°"‘ W‘ 599 ""19
+vamer man a novel, and "Anna Kavemna" mm) “” '“""
+
+Nxswark was admued m hiswne by Doxlayevxky,
+
+Chxkoxa Turgenev, and Flauben and Water by Tm EaSyREad super
+
+wgmiz wow and )ame§Joy<e Large 24 Eamon ‘S
+"we: and Peace" 2 bnlhant prose epm by Tolstoy °'.“'m'Zed for readers
+He pmsemsthe mm; M1892 axe mom msaae. "‘{"h ‘eVe'9‘Y "Ed“<9d
+when m: Russwanswon agamsune Napo\zcn>c vlsxon
+onshuqhtlhrmlghlhevadherenzr(owrme He
+tamures the essenze af Ne wwlh an R5 mancnges,
+hardsmps andmys The maepm poV\raya\ at
+numemus thataclerx lrom an xpheves m We wands
+u unparalleled reahsm Aime dassm‘
+
+
+
+su+u27o3ro2ar1
+
+||I||| H
+
+cuvmumeu .21mna
+
+
+
diff --git a/chapter12/1-headers.py b/chapter12/1-headers.py
@@ -0,0 +1,14 @@
+import requests
+from bs4 import BeautifulSoup
+
+session = requests.Session()
+headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)
+                         AppleWebKit 537.36 (KHTML, like Gecko) Chrome",
+           "Accept":"text/html,application/xhtml+xml,application/xml;
+                     q=0.9,image/webp,*/*;q=0.8"}
+url = "https://www.whatismybrowser.com/
+       developers/what-http-headers-is-my-browser-sending"
+req = session.get(url, headers=headers)
+
+bsObj = BeautifulSoup(req.text)
+print(bsObj.find("table",{"class":"table-striped"}).get_text)
diff --git a/chapter12/2-seleniumCookies.py b/chapter12/2-seleniumCookies.py
@@ -0,0 +1,18 @@
+from selenium import webdriver
+
+driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
+driver.get("http://pythonscraping.com")
+driver.implicitly_wait(1)
+print(driver.get_cookies())
+
+savedCookies = driver.get_cookies()
+
+driver2 = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
+driver2.get("http://pythonscraping.com")
+driver2.delete_all_cookies()
+for cookie in savedCookies:
+    driver2.add_cookie(cookie)
+
+driver2.get("http://pythonscraping.com")
+driver.implicitly_wait(1)
+print(driver2.get_cookies())
diff --git a/chapter12/3-honeypotDetection.py b/chapter12/3-honeypotDetection.py
@@ -0,0 +1,14 @@
+from selenium import webdriver
+from selenium.webdriver.remote.webelement import WebElement
+
+driver = webdriver.PhantomJS(executable_path='')
+driver.get("http://pythonscraping.com/pages/itsatrap.html")
+links = driver.find_elements_by_tag_name("a")
+for link in links:
+    if not link.is_displayed():
+        print("The link "+link.get_attribute("href")+" is a trap")
+
+fields = driver.find_elements_by_tag_name("input")
+for field in fields:
+    if not field.is_displayed():
+        print("Do not change value of "+field.get_attribute("name"))
diff --git a/chapter13/1-wikiUnitTest.py b/chapter13/1-wikiUnitTest.py
@@ -0,0 +1,52 @@
+from urllib.request import urlopen
+from urllib.parse import unquote
+import random
+import re
+from bs4 import BeautifulSoup
+import unittest
+
+class TestWikipedia(unittest.TestCase):
+
+    bsObj = None
+    url = None
+
+
+    def test_PageProperties(self):
+        global bsObj
+        global url
+
+        url = "http://en.wikipedia.org/wiki/Monty_Python"
+        #Test the first 100 pages we encounter
+        for i in range(1, 100):
+            bsObj = BeautifulSoup(urlopen(url))
+            titles = self.titleMatchesURL()
+            self.assertEquals(titles[0], titles[1])
+            self.assertTrue(self.contentExists())
+            url = self.getNextLink()
+        print("Done!")
+
+    def titleMatchesURL(self):
+        global bsObj
+        global url
+        pageTitle = bsObj.find("h1").get_text()
+        urlTitle = url[(url.index("/wiki/")+6):]
+        urlTitle = urlTitle.replace("_", " ")
+        urlTitle = unquote(urlTitle)
+        return [pageTitle.lower(), urlTitle.lower()]
+
+    def contentExists(self):
+        global bsObj
+        content = bsObj.find("div",{"id":"mw-content-text"})
+        if content is not None:
+            return True
+        return False
+
+    def getNextLink(self):
+        global bsObj
+        links = bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
+        link = links[random.randint(0, len(links)-1)].attrs['href']
+        print("Next link is: "+link)
+        return "http://en.wikipedia.org"+link
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/chapter13/2-wikiSeleniumTest.py b/chapter13/2-wikiSeleniumTest.py
@@ -0,0 +1,8 @@
+from selenium import webdriver
+
+
+driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
+driver.get("http://en.wikipedia.org/wiki/Monty_Python")
+assert "Monty Python" in driver.title
+print("Monty Python was not in the title")
+driver.close()