forked from REMitchell/python-scraping
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Ryan Mitchell
authored and
Ryan Mitchell
committed
Jun 15, 2015
1 parent
e939535
commit 4260ca4
Showing
98 changed files
with
1,629 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from urllib.request import urlopen | ||
# Fetch the exercise page and print its raw bytes. The context manager
# closes the HTTP response once the body has been read.
with urlopen("http://www.pythonscraping.com/exercises/exercise1.html") as html:
    print(html.read())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from urllib.request import urlopen | ||
from bs4 import BeautifulSoup | ||
|
||
# Download the page and parse it. Naming the parser explicitly keeps the
# result independent of which optional parsers happen to be installed.
with urlopen("http://www.pythonscraping.com/exercises/exercise1.html") as html:
    bsObj = BeautifulSoup(html.read(), "html.parser")
print(bsObj.h1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from urllib.request import urlopen | ||
from urllib.error import HTTPError | ||
from bs4 import BeautifulSoup | ||
import sys | ||
|
||
|
||
def getTitle(url):
    """Return the first <h1> tag inside the body of the page at `url`.

    Returns None when the page cannot be fetched (HTTP error status,
    unreachable server, unsupported URL scheme), when the document has no
    <body>, or when the body contains no <h1>.
    """
    # Imported locally so the function does not depend on a file-level import
    from urllib.error import URLError

    try:
        html = urlopen(url)
    except URLError as e:
        # HTTPError is a subclass of URLError, so HTTP failures land here too
        print(e)
        return None
    try:
        with html:
            bsObj = BeautifulSoup(html.read(), "html.parser")
        # bsObj.body is None for a document without <body>; the attribute
        # access on None is what raises AttributeError
        title = bsObj.body.h1
    except AttributeError:
        return None
    return title
|
||
# Look up the heading of the exercise page and report the outcome.
title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
# Identity check: None is a singleton, and `==` would go through the
# returned object's own equality method
if title is None:
    print("Title could not be found")
else:
    print(title)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from selenium import webdriver | ||
import time | ||
|
||
driver = webdriver.PhantomJS(executable_path='')
try:
    driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
    # Fixed pause before reading the element (presumably to let the page's
    # scripted content load - confirm against the demo page)
    time.sleep(3)
    print(driver.find_element_by_id("content").text)
finally:
    # Release the browser window even if the lookup above raises
    driver.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.support.ui import WebDriverWait | ||
from selenium.webdriver.support import expected_conditions as EC | ||
|
||
# The imports above never bring the top-level `webdriver` module into scope,
# so import it here; without this the next statement raises NameError.
from selenium import webdriver

driver = webdriver.PhantomJS(executable_path='')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
try:
    # Wait up to 10 seconds for an element with id "loadedButton" to be
    # present in the DOM
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "loadedButton")))
finally:
    # Runs whether or not the wait timed out
    print(driver.find_element_by_id("content").text)
    driver.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from selenium import webdriver | ||
import time | ||
from selenium.webdriver.remote.webelement import WebElement | ||
from selenium.common.exceptions import StaleElementReferenceException | ||
|
||
def waitForLoad(driver):
    """Poll every half second until the original <html> element goes stale.

    Returns as soon as comparing the remembered element with a freshly
    located one raises StaleElementReferenceException. After 20 polls it
    prints a timeout message and returns anyway.
    """
    original_root = driver.find_element_by_tag_name("html")
    for _ in range(20):
        time.sleep(.5)
        try:
            # The comparison result is deliberately discarded; only the
            # exception matters. NOTE(review): whether this comparison can
            # raise depends on the installed selenium version - confirm.
            original_root == driver.find_element_by_tag_name("html")
        except StaleElementReferenceException:
            return
    print("Timing out after 10 seconds and returning")
    return
|
||
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
try:
    driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
    waitForLoad(driver)
    print(driver.page_source)
finally:
    # Shut the browser down; previously the PhantomJS process was left running
    driver.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from PIL import Image, ImageFilter | ||
|
||
# Blur the sample image, save the result and open it in a viewer.
# The context manager closes the source file once the filter has produced
# its own independent image.
with Image.open("kitten.jpg") as kitten:
    blurryKitten = kitten.filter(ImageFilter.GaussianBlur)
blurryKitten.save("kitten_blurred.jpg")
blurryKitten.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from PIL import Image | ||
import subprocess | ||
|
||
def cleanFile(filePath, newFilePath):
    """Threshold an image, run Tesseract on the result and print the text.

    The image at `filePath` is reduced to pure black/white and written to
    `newFilePath`. Tesseract is then invoked with the output base name
    "output", and the resulting "output.txt" in the current directory is
    printed.
    """
    image = Image.open(filePath)

    # Set a threshold value for the image, and save
    image = image.point(lambda x: 0 if x < 143 else 255)
    image.save(newFilePath)

    # Call tesseract to do OCR on the newly created image
    subprocess.call(["tesseract", newFilePath, "output"])

    # Read the resulting data file; `with` closes it even if printing fails
    with open("output.txt", 'r') as outputFile:
        print(outputFile.read())
|
||
# Threshold text_2.png into text_2_clean.png and print the OCR result.
cleanFile("text_2.png", "text_2_clean.png")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import time | ||
from urllib.request import urlretrieve | ||
import subprocess | ||
from selenium import webdriver | ||
|
||
driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
#driver = webdriver.Firefox()
#The easiest way to get exactly one of every page
imageList = set()
try:
    driver.get("http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200")
    time.sleep(2)

    driver.find_element_by_id("sitbLogoImg").click()

    #Wait for the page to load
    time.sleep(10)
    print(driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"))
    while "pointer" in driver.find_element_by_id("sitbReaderRightPageTurner").get_attribute("style"):
        #While we can click on the right arrow, move through the pages
        driver.find_element_by_id("sitbReaderRightPageTurner").click()
        time.sleep(2)
        #Get any new pages that have loaded (multiple pages can load at once)
        pages = driver.find_elements_by_xpath("//div[@class='pageImage']/div/img")
        for page in pages:
            image = page.get_attribute("src")
            # Skip images without a src: a None entry would make the later
            # sorted(imageList) fail when compared against URL strings
            if image:
                imageList.add(image)
finally:
    # Shut the browser down even if an element lookup above raises
    driver.quit()
|
||
#Start processing the images we've collected URLs for with Tesseract
for image in sorted(imageList):
    # Each page reuses the same scratch files: page.jpg in, page.txt out
    urlretrieve(image, "page.jpg")
    p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes while waiting for exit; wait() alone
    # can deadlock once the child fills a pipe buffer
    p.communicate()
    # `with` closes the result file on every iteration instead of leaking it
    with open("page.txt", "r") as f:
        print(f.read())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
from urllib.request import urlretrieve | ||
from urllib.request import urlopen | ||
from bs4 import BeautifulSoup | ||
import subprocess | ||
import requests | ||
from PIL import Image | ||
from PIL import ImageOps | ||
|
||
def cleanImage(imagePath):
    """Threshold the image at `imagePath` and pad it, overwriting the file.

    Pixel values below 143 become 0 and all others 255; a 20-pixel white
    border is then added and the result is saved back to the same path.
    """
    thresholded = Image.open(imagePath).point(lambda px: 0 if px<143 else 255)
    ImageOps.expand(thresholded, border=20, fill='white').save(imagePath)
|
||
# Parse the form page with an explicitly named parser and close the
# response once it has been consumed.
with urlopen("http://www.pythonscraping.com/humans-only") as html:
    bsObj = BeautifulSoup(html, "html.parser")
#Gather prepopulated form values
imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
formBuildId = bsObj.find("input", {"name":"form_build_id"})["value"]
captchaSid = bsObj.find("input", {"name":"captcha_sid"})["value"]
captchaToken = bsObj.find("input", {"name":"captcha_token"})["value"]

captchaUrl = "http://pythonscraping.com"+imageLocation
urlretrieve(captchaUrl, "captcha.jpg")
cleanImage("captcha.jpg")
p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"],
                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# communicate() drains both pipes while waiting; wait() alone can deadlock
# once the child fills a pipe buffer
p.communicate()

with open("captcha.txt", "r") as f:
    #Clean any whitespace characters
    captchaResponse = f.read().replace(" ", "").replace("\n", "")
print("Captcha solution attempt: "+captchaResponse)
|
||
# Only submit when the OCR result has exactly five characters.
if len(captchaResponse) == 5:
    params = {"captcha_token":captchaToken, "captcha_sid":captchaSid,
              "form_id":"comment_node_page_form", "form_build_id": formBuildId,
              "captcha_response":captchaResponse, "name":"Ryan Mitchell",
              "subject": "I come to seek the Grail",
              "comment_body[und][0][value]":
                  "...and I am definitely not a bot"}
    r = requests.post("http://www.pythonscraping.com/comment/reply/10",
                      data=params)
    responseObj = BeautifulSoup(r.text, "html.parser")
    # Look the message container up once and reuse the result
    messages = responseObj.find("div", {"class":"messages"})
    if messages is not None:
        print(messages.get_text())
else:
    print("There was a problem reading the CAPTCHA correctly!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
PhantomJS is launching GhostDriver... | ||
[INFO - 2015-06-15T00:08:45.592Z] GhostDriver - Main - running on port 51799 | ||
[INFO - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) PhantomJS/1.9.8 Safari/534.34","webSecurityEnabled":true} | ||
[INFO - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.customHeaders: - {} | ||
[INFO - 2015-06-15T00:08:46.231Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"1.9.8","driverName":"ghostdriver","driverVersion":"1.1.0","platform":"mac-10.9 (Mavericks)-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}} | ||
[INFO - 2015-06-15T00:08:46.231Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9 | ||
[ERROR - 2015-06-15T00:08:47.864Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - msg: ReferenceError: Can't find variable: ue | ||
[ERROR - 2015-06-15T00:08:47.864Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - stack: | ||
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:10202) | ||
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:940) | ||
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:807) | ||
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:696) | ||
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:784) | ||
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:959) | ||
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:161) | ||
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:70) | ||
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:960) | ||
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:2615) | ||
(anonymous function) (http://z-ecx.images-amazon.com/images/G/01/browser-scripts/site-wide-js-1.6.4-beacon/site-wide-10223378515._V1_.js:2640) | ||
(anonymous function) (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:10203) | ||
[ERROR - 2015-06-15T00:08:47.865Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - msg: TypeError: 'null' is not an object (evaluating 'old_error_handler.apply') | ||
[ERROR - 2015-06-15T00:08:47.865Z] Session [b0ccb720-12f2-11e5-b3d6-0ba38f30e2f9] - page.onError - stack: | ||
dpOnErrorOverride (http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200:1186) |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
WEI‘ nrrd Peace | ||
Len Nlkelayevldu Iolfluy | ||
|
||
Readmg shmdd be ax | ||
wlnvame asnossxble Wenfler | ||
an mm m our cram: Llhvary | ||
|
||
— Leo Tmsloy was a Russian rwovelwst | ||
I and moval phflmopher med lur | ||
A ms Ideas 01 nonviolenx reswslance m 5 We range 0, “and” | ||
|
||
M. | ||
known for ms genevosxly to the '°'”"“‘ ‘”' "*°’“‘* W‘"' | ||
|
||
reading dwnmuie,—a|\ | ||
‘L °““"“‘ opnmizedfarreadabIh(y—so | ||
readerscan tho the fnrmat | ||
|
||
ms best knawn nuvckave “War and | ||
|
||
Peace" (1869),w>v(h msmy regarded as an em ‘”“”L"‘5"‘°"‘ W‘ 599 ""19 | ||
vamer man a novel, and "Anna Kavemna" mm) “” '“"" | ||
|
||
Nxswark was admued m hiswne by Doxlayevxky, | ||
|
||
Chxkoxa Turgenev, and Flauben and Water by Tm EaSyREad super | ||
|
||
wgmiz wow and )ame§Joy<e Large 24 Eamon ‘S | ||
"we: and Peace" 2 bnlhant prose epm by Tolstoy °'.“'m'Zed for readers | ||
He pmsemsthe mm; M1892 axe mom msaae. "‘{"h ‘eVe'9‘Y "Ed“<9d | ||
when m: Russwanswon agamsune Napo\zcn>c vlsxon | ||
onshuqhtlhrmlghlhevadherenzr(owrme He | ||
tamures the essenze af Ne wwlh an R5 mancnges, | ||
hardsmps andmys The maepm poV\raya\ at | ||
numemus thataclerx lrom an xpheves m We wands | ||
u unparalleled reahsm Aime dassm‘ | ||
|
||
|
||
|
||
su+u27o3ro2ar1 | ||
|
||
||I||| H | ||
|
||
cuvmumeu .21mna | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
session = requests.Session()
# Long values are split with implicit string concatenation inside
# parentheses; a bare line break inside a quoted string is a syntax error.
headers = {
    "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) "
                   "AppleWebKit 537.36 (KHTML, like Gecko) Chrome"),
    "Accept": ("text/html,application/xhtml+xml,application/xml;"
               "q=0.9,image/webp,*/*;q=0.8"),
}
url = ("https://www.whatismybrowser.com/"
       "developers/what-http-headers-is-my-browser-sending")
req = session.get(url, headers=headers)

bsObj = BeautifulSoup(req.text, "html.parser")
# Call get_text(); without the parentheses the bound method object itself
# is printed instead of the table's text
print(bsObj.find("table",{"class":"table-striped"}).get_text())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from selenium import webdriver | ||
|
||
driver = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver.get("http://pythonscraping.com")
driver.implicitly_wait(1)
print(driver.get_cookies())

savedCookies = driver.get_cookies()

# Second browser: load the site, drop its own cookies, then install the
# cookies captured from the first browser.
driver2 = webdriver.PhantomJS(executable_path='<Path to Phantom JS>')
driver2.get("http://pythonscraping.com")
driver2.delete_all_cookies()
for cookie in savedCookies:
    driver2.add_cookie(cookie)

driver2.get("http://pythonscraping.com")
# Configure the wait on the second driver, the one queried below
# (the original set it on `driver` again)
driver2.implicitly_wait(1)
print(driver2.get_cookies())

# Shut both browsers down; previously neither was released
driver.quit()
driver2.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from selenium import webdriver | ||
from selenium.webdriver.remote.webelement import WebElement | ||
|
||
driver = webdriver.PhantomJS(executable_path='')
try:
    driver.get("http://pythonscraping.com/pages/itsatrap.html")

    # Report every link that is present in the page but not displayed
    links = driver.find_elements_by_tag_name("a")
    for link in links:
        if not link.is_displayed():
            # str() keeps the report working when the attribute is missing
            # (get_attribute then returns None, which cannot be concatenated)
            print("The link "+str(link.get_attribute("href"))+" is a trap")

    # Report every input field that is present but not displayed
    fields = driver.find_elements_by_tag_name("input")
    for field in fields:
        if not field.is_displayed():
            print("Do not change value of "+str(field.get_attribute("name")))
finally:
    # Shut the browser down; previously it was never released
    driver.quit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from urllib.request import urlopen | ||
from urllib.parse import unquote | ||
import random | ||
import re | ||
from bs4 import BeautifulSoup | ||
import unittest | ||
|
||
class TestWikipedia(unittest.TestCase):
    """Random-walk check over Wikipedia article pages.

    Starting from the Monty Python article, each visited page is checked
    for a heading that matches its URL and for the content container, then
    a randomly chosen article link on that page is followed.
    """

    # Parsed document and address of the page currently under test. Kept on
    # the instance rather than in module globals so the helpers below read
    # the same state the test method writes.
    bsObj = None
    url = None

    def test_PageProperties(self):
        """Check heading and content on 100 consecutively linked pages."""
        self.url = "http://en.wikipedia.org/wiki/Monty_Python"
        #Test the first 100 pages we encounter
        for _ in range(100):
            with urlopen(self.url) as response:
                self.bsObj = BeautifulSoup(response, "html.parser")
            titles = self.titleMatchesURL()
            # assertEqual: the assertEquals alias is deprecated and no
            # longer exists in Python 3.12
            self.assertEqual(titles[0], titles[1])
            self.assertTrue(self.contentExists())
            self.url = self.getNextLink()
        print("Done!")

    def titleMatchesURL(self):
        """Return [page heading, title derived from the URL], lower-cased."""
        pageTitle = self.bsObj.find("h1").get_text()
        # Everything after "/wiki/" (6 characters) is the article name
        urlTitle = self.url[(self.url.index("/wiki/")+6):]
        urlTitle = urlTitle.replace("_", " ")
        urlTitle = unquote(urlTitle)
        return [pageTitle.lower(), urlTitle.lower()]

    def contentExists(self):
        """Return True when the page has a div with id "mw-content-text"."""
        content = self.bsObj.find("div",{"id":"mw-content-text"})
        return content is not None

    def getNextLink(self):
        """Return the absolute URL of a randomly chosen article link."""
        # Links under /wiki/ whose path contains no colon
        links = self.bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
        link = random.choice(links).attrs['href']
        print("Next link is: "+link)
        return "http://en.wikipedia.org"+link
|
||
# Run the test case when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from selenium import webdriver | ||
|
||
|
||
driver = webdriver.PhantomJS(executable_path='/Users/ryan/Documents/pythonscraping/code/headless/phantomjs-1.9.8-macosx/bin/phantomjs')
try:
    driver.get("http://en.wikipedia.org/wiki/Monty_Python")
    # The failure text belongs to the assertion: printed unconditionally on
    # the next line, it reported a problem exactly when the check had passed
    assert "Monty Python" in driver.title, "Monty Python was not in the title"
finally:
    # Close the browser window even when the assertion fails
    driver.close()
Oops, something went wrong.