Commit

Initial commit of code from book

Ryan Mitchell authored and Ryan Mitchell committed Jun 15, 2015
1 parent e939535 commit 4260ca4

Showing 98 changed files with 1,629 additions and 0 deletions.
Binary file added chapter1/.DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions chapter1/1-basicExample.py
@@ -0,0 +1,3 @@
from urllib.request import urlopen
html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
print(html.read())
6 changes: 6 additions & 0 deletions chapter1/2-beautifulSoup.py
@@ -0,0 +1,6 @@
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
bsObj = BeautifulSoup(html.read())
print(bsObj.h1)
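
A side note, not part of the commit: recent BeautifulSoup versions warn when no parser is named, so the call can pass one explicitly. A minimal sketch using the standard library's html.parser:

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
# Naming the parser explicitly avoids bs4's "no parser was explicitly specified" warning
bsObj = BeautifulSoup(html.read(), "html.parser")
print(bsObj.h1)
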
26 changes: 26 additions & 0 deletions chapter1/3-exceptionHandling.py
@@ -0,0 +1,26 @@
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import sys


def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        print(e)
        return None
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title

title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)
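
A complementary sketch, not in the commit: the same pattern can also guard against unreachable servers by catching urllib's URLError alongside HTTPError (the helper name getTitleSafe is illustrative):

from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup

def getTitleSafe(url):
    try:
        html = urlopen(url)
    except (HTTPError, URLError) as e:
        # HTTPError: the server returned an error code; URLError: the server could not be reached
        print(e)
        return None
    try:
        return BeautifulSoup(html.read()).body.h1
    except AttributeError:
        return None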


8 changes: 8 additions & 0 deletions chapter10/1-seleniumBasic.py
@@ -0,0 +1,8 @@
from selenium import webdriver
import time

driver = webdriver.PhantomJS(executable_path='')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
time.sleep(3)
print(driver.find_element_by_id("content").text)
driver.close()
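
PhantomJS can be hard to obtain nowadays; with a reasonably recent Selenium, a headless Chrome session can stand in for these examples. A minimal sketch, not part of the commit, assuming chromedriver is installed and on the PATH:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")          # run Chrome without opening a window
driver = webdriver.Chrome(options=options)  # assumes chromedriver is on the PATH
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
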
11 changes: 11 additions & 0 deletions chapter10/2-waitForLoad.py
@@ -0,0 +1,11 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS(executable_path='')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
try:
    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))
finally:
    print(driver.find_element_by_id("content").text)
    driver.close()
23 changes: 23 additions & 0 deletions chapter10/3-javascriptRedirect.py
@@ -0,0 +1,23 @@
from selenium import webdriver
import time
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException

def waitForLoad(driver):
    elem = driver.find_element_by_tag_name("html")
    count = 0
    while True:
        count += 1
        if count > 20:
            print("Timing out after 10 seconds and returning")
            return
        time.sleep(.5)
        try:
            # Touching the old element raises StaleElementReferenceException
            # once the original page has been replaced by the redirect target
            elem == driver.find_element_by_tag_name("html")
        except StaleElementReferenceException:
            return

driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)
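
The same redirect wait can also be written with Selenium's own wait helpers; a sketch, not in the commit, using WebDriverWait and expected_conditions.staleness_of (the name waitForLoad2 is illustrative):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def waitForLoad2(driver, timeout=10):
    # Block until the original <html> element goes stale, i.e. the page has been replaced
    oldPage = driver.find_element_by_tag_name("html")
    WebDriverWait(driver, timeout).until(EC.staleness_of(oldPage))
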
6 changes: 6 additions & 0 deletions chapter11/1-basicImage.py
@@ -0,0 +1,6 @@
from PIL import Image, ImageFilter

kitten = Image.open("kitten.jpg")
blurryKitten = kitten.filter(ImageFilter.GaussianBlur)
blurryKitten.save("kitten_blurred.jpg")
blurryKitten.show()
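
Pillow's GaussianBlur also accepts an explicit radius for a stronger blur; a small variation on the same call (variable and file names are illustrative):

from PIL import Image, ImageFilter

kitten = Image.open("kitten.jpg")
# radius defaults to 2; larger values blur more aggressively
veryBlurryKitten = kitten.filter(ImageFilter.GaussianBlur(radius=5))
veryBlurryKitten.save("kitten_very_blurred.jpg")
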
19 changes: 19 additions & 0 deletions chapter11/2-cleanImage.py
@@ -0,0 +1,19 @@
from PIL import Image
import subprocess

def cleanFile(filePath, newFilePath):
    image = Image.open(filePath)

    #Set a threshold value for the image, and save
    image = image.point(lambda x: 0 if x<143 else 255)
    image.save(newFilePath)

    #call tesseract to do OCR on the newly created image
    subprocess.call(["tesseract", newFilePath, "output"])

    #Open and read the resulting data file
    outputFile = open("output.txt", 'r')
    print(outputFile.read())
    outputFile.close()

cleanFile("text_2.png", "text_2_clean.png")
36 changes: 36 additions & 0 deletions chapter11/3-readWebImages.py
@@ -0,0 +1,36 @@
import time
from urllib.request import urlretrieve
import subprocess
from selenium import webdriver

driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
#driver = webdriver.Firefox()
driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
time.sleep(2)

driver.find_element_by_id("sitbLogoImg").click()
#The easiest way to get exactly one of every page
imageList = set()

#Wait for the page to load
time.sleep(10)
print(driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"))
while "pointer" in driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"):
#While we can click on the right arrow, move through the pages
driver.find_element_by_id("sitbReaderRightPageTurner").click()
time.sleep(2)
#Get any new pages that have loaded (multiple pages can load at once)
pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img")
for page in pages:
image = page.get_attribute("src")
imageList.add(image)

driver.quit()

#Start processing the images we've collected URLs for with Tesseract
for image in sorted(imageList):
    urlretrieve(image, "page.jpg")
    p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    p.wait()
    f = open("page.txt", "r")
    print(f.read())
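
An alternative to shelling out to the tesseract binary, not used in this commit, is the pytesseract wrapper; a minimal sketch, assuming that package and Pillow are installed:

from PIL import Image
import pytesseract

# Read the text of the downloaded page image directly, without spawning a subprocess
text = pytesseract.image_to_string(Image.open("page.jpg"))
print(text)
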
48 changes: 48 additions & 0 deletions chapter11/4-solveCaptcha.py
@@ -0,0 +1,48 @@
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import subprocess
import requests
from PIL import Image
from PIL import ImageOps

def cleanImage(imagePath):
    image = Image.open(imagePath)
    image = image.point(lambda x: 0 if x<143 else 255)
    borderImage = ImageOps.expand(image, border=20, fill='white')
    borderImage.save(imagePath)

html = urlopen("http://www.pythonscraping.com/humans-only")
bsObj = BeautifulSoup(html)
#Gather prepopulated form values
imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
formBuildId = bsObj.find("input", {"name":"form_build_id"})["value"]
captchaSid = bsObj.find("input", {"name":"captcha_sid"})["value"]
captchaToken = bsObj.find("input", {"name":"captcha_token"})["value"]

captchaUrl = "http://pythonscraping.com"+imageLocation
urlretrieve(captchaUrl, "captcha.jpg")
cleanImage("captcha.jpg")
p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"], stdout=
subprocess.PIPE,stderr=subprocess.PIPE)
p.wait()
f = open("captcha.txt", "r")

#Clean any whitespace characters
captchaResponse = f.read().replace(" ", "").replace("\n", "")
print("Captcha solution attempt: "+captchaResponse)

if len(captchaResponse) == 5:
    params = {"captcha_token":captchaToken, "captcha_sid":captchaSid,
              "form_id":"comment_node_page_form", "form_build_id": formBuildId,
              "captcha_response":captchaResponse, "name":"Ryan Mitchell",
              "subject": "I come to seek the Grail",
              "comment_body[und][0][value]":
                  "...and I am definitely not a bot"}
    r = requests.post("http://www.pythonscraping.com/comment/reply/10",
                      data=params)
    responseObj = BeautifulSoup(r.text)
    if responseObj.find("div", {"class":"messages"}) is not None:
        print(responseObj.find("div", {"class":"messages"}).get_text())
else:
    print("There was a problem reading the CAPTCHA correctly!")
23 changes: 23 additions & 0 deletions chapter11/ghostdriver.log
@@ -0,0 +1,23 @@
PhantomJS is launching GhostDriver...
[INFO - 2015-06-15T00:08:45.592Z] GhostDriver - Main - running on port 51799
[INFO - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) PhantomJS/1.9.8 Safari/534.34","webSecurityEnabled":true}
[INFO - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.customHeaders: - {}
[INFO - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"1.9.8","driverName":"ghostdriver","driverVersion":"1.1.0","platform":"mac-10.9 (Mavericks)-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
[INFO - 2015-06-15T00:08:46.231Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9
[ERROR - 2015-06-15T00:08:47.864Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - msg: ReferenceError: Can't find variable: ue
[ERROR - 2015-06-15T00:08:47.864Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - stack:
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:10202)
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:940)
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:807)
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:696)
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:784)
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:959)
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:161)
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:70)
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:960)
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:2615)
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:2640)
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:10203)
[ERROR - 2015-06-15T00:08:47.865Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - msg: TypeError: 'null' is not an object (evaluating 'old_error_handler.apply')
[ERROR - 2015-06-15T00:08:47.865Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - stack:
dpOnErrorOverride (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:1186)
Binary file added chapter11/page.jpg
Binary file not shown.
47 changes: 47 additions & 0 deletions chapter11/page.txt
@@ -0,0 +1,47 @@
WEI‘ nrrd Peace
Len Nlkelayevldu Iolfluy

Readmg shmdd be ax
wlnvame asnossxble Wenfler
an mm m our cram: Llhvary

— Leo Tmsloy was a Russian rwovelwst
I and moval phflmopher med lur
A ms Ideas 01 nonviolenx reswslance m 5 We range 0, “and”

M.
known for ms genevosxly to the '°'”"“‘ ‘”' "*°’“‘* W‘"'

reading dwnmuie,—a|\
‘L °““"“‘ opnmizedfarreadabIh(y—so
readerscan tho the fnrmat

ms best knawn nuvckave “War and

Peace" (1869),w>v(h msmy regarded as an em ‘”“”L"‘5"‘°"‘ W‘ 599 ""19
vamer man a novel, and "Anna Kavemna" mm) “” '“""

Nxswark was admued m hiswne by Doxlayevxky,

Chxkoxa Turgenev, and Flauben and Water by Tm EaSyREad super

wgmiz wow and )ame§Joy<e Large 24 Eamon ‘S
"we: and Peace" 2 bnlhant prose epm by Tolstoy °'.“'m'Zed for readers
He pmsemsthe mm; M1892 axe mom msaae. "‘{"h ‘eVe'9‘Y "Ed“<9d
when m: Russwanswon agamsune Napo\zcn>c vlsxon
onshuqhtlhrmlghlhevadherenzr(owrme He
tamures the essenze af Ne wwlh an R5 mancnges,
hardsmps andmys The maepm poV\raya\ at
numemus thataclerx lrom an xpheves m We wands
u unparalleled reahsm Aime dassm‘



su+u27o3ro2ar1

||I||| H

cuvmumeu .21mna



14 changes: 14 additions & 0 deletions chapter12/1-headers.py
@@ -0,0 +1,14 @@
import requests
from bs4 import BeautifulSoup

session = requests.Session()
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome",
           "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
url = "https://www.whatismybrowser.com/developers/what-http-headers-is-my-browser-sending"
req = session.get(url, headers=headers)

bsObj = BeautifulSoup(req.text)
print(bsObj.find("table",{"class":"table-striped"}).get_text())
18 changes: 18 additions & 0 deletions chapter12/2-seleniumCookies.py
@@ -0,0 +1,18 @@
from selenium import webdriver

driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver.get("http://pythonscraping.com")
driver.implicitly_wait(1)
print(driver.get_cookies())

savedCookies = driver.get_cookies()

driver2 = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver2.get("http://pythonscraping.com")
driver2.delete_all_cookies()
for cookie in savedCookies:
driver2.add_cookie(cookie)

driver2.get("http://pythonscraping.com")
driver2.implicitly_wait(1)
print(driver2.get_cookies())
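
Because get_cookies() returns a list of plain dicts, the captured cookies can also be persisted between runs; a minimal sketch with the standard json module, not in the commit, reusing savedCookies and driver2 from the script above:

import json

# Save the cookies captured in the first session...
with open("cookies.json", "w") as f:
    json.dump(savedCookies, f)

# ...and load them back into a fresh session later
with open("cookies.json") as f:
    for cookie in json.load(f):
        driver2.add_cookie(cookie)
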
14 changes: 14 additions & 0 deletions chapter12/3-honeypotDetection.py
@@ -0,0 +1,14 @@
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement

driver = webdriver.PhantomJS(executable_path='')
driver.get("http://pythonscraping.com/pages/itsatrap.html")
links = driver.find_elements_by_tag_name("a")
for link in links:
    if not link.is_displayed():
        print("The link "+link.get_attribute("href")+" is a trap")

fields = driver.find_elements_by_tag_name("input")
for field in fields:
    if not field.is_displayed():
        print("Do not change value of "+field.get_attribute("name"))
52 changes: 52 additions & 0 deletions chapter13/1-wikiUnitTest.py
@@ -0,0 +1,52 @@
from urllib.request import urlopen
from urllib.parse import unquote
import random
import re
from bs4 import BeautifulSoup
import unittest

class TestWikipedia(unittest.TestCase):

    bsObj = None
    url = None


    def test_PageProperties(self):
        global bsObj
        global url

        url = "http://en.wikipedia.org/wiki/Monty_Python"
        #Test the first 100 pages we encounter
        for i in range(1, 100):
            bsObj = BeautifulSoup(urlopen(url))
            titles = self.titleMatchesURL()
            self.assertEqual(titles[0], titles[1])
            self.assertTrue(self.contentExists())
            url = self.getNextLink()
        print("Done!")

    def titleMatchesURL(self):
        global bsObj
        global url
        pageTitle = bsObj.find("h1").get_text()
        urlTitle = url[(url.index("/wiki/")+6):]
        urlTitle = urlTitle.replace("_", " ")
        urlTitle = unquote(urlTitle)
        return [pageTitle.lower(), urlTitle.lower()]

    def contentExists(self):
        global bsObj
        content = bsObj.find("div",{"id":"mw-content-text"})
        if content is not None:
            return True
        return False

    def getNextLink(self):
        global bsObj
        links = bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
        link = links[random.randint(0, len(links)-1)].attrs['href']
        print("Next link is: "+link)
        return "http://en.wikipedia.org"+link

if __name__ == '__main__':
    unittest.main()
8 changes: 8 additions & 0 deletions chapter13/2-wikiSeleniumTest.py
@@ -0,0 +1,8 @@
from selenium import webdriver


driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
driver.get("http://en.wikipedia.org/wiki/Monty_Python")
assert "Monty Python" in driver.title
print("Monty Python was not in the title")
driver.close()

0 comments on commit 4260ca4
