Skip to content

Commit

Permalink
update class 7 code files
Browse files Browse the repository at this point in the history
  • Loading branch information
justmarkham committed Sep 9, 2015
1 parent 310f0cf commit bde11cf
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 2 deletions.
29 changes: 29 additions & 0 deletions code/07_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,59 @@
'''

# read IMDb data into a DataFrame: we want a year column!
# NOTE: the path is relative, so run this from the directory holding imdb_1000.csv
import pandas as pd
movies = pd.read_csv('imdb_1000.csv')
movies.head()

# use requests library to interact with a URL
# 't' carries the movie title; 'r' and 'type' presumably select the response
# format and the kind of result -- confirm against the OMDb API documentation
import requests
r = requests.get('http://www.omdbapi.com/?t=the shawshank redemption&r=json&type=movie')

# check the status: 200 means success, 4xx means error
# (bare expressions like this one only display a value when run interactively)
r.status_code

# view the raw response text
r.text

# decode the JSON response body into a dictionary
r.json()

# extracting the year from the dictionary
# (indexing the decoded dictionary by the 'Year' key)
r.json()['Year']

# what happens if the movie name is not recognized?
# inspect both the HTTP status and the decoded body for the unknown title
r = requests.get('http://www.omdbapi.com/?t=blahblahblah&r=json&type=movie')
r.status_code
r.json()

# define a function to return the year
def get_movie_year(title):
    '''
    Look up a movie title in the OMDb API and return its release year.

    title: movie title to search for (sent as the 't' query parameter)

    Returns the year as an int, or None when the API does not answer with
    Response == 'True' or when the 'Year' field is missing or not an integer.
    Errors raised by requests.get() or r.json() (network failure, body that
    is not JSON) are not caught here.
    '''
    # let requests build the query string so that titles containing characters
    # such as '&', '#' or '+' are percent-encoded instead of corrupting the URL
    params = {'t': title, 'r': 'json', 'type': 'movie'}
    r = requests.get('http://www.omdbapi.com/', params=params)
    info = r.json()
    # .get() avoids a KeyError if the 'Response' field is absent
    if info.get('Response') != 'True':
        return None
    try:
        return int(info['Year'])
    except (KeyError, ValueError):
        # no usable year in the answer: treat it like an unrecognized title
        return None

# test the function
# (bare calls: the returned values are only displayed when run interactively)
get_movie_year('The Shawshank Redemption')
get_movie_year('blahblahblah')

# create a smaller DataFrame for testing
# head() keeps only the first rows; copy() makes an independent DataFrame, so
# adding a column to it later does not modify 'movies'
top_movies = movies.head().copy()

# look up each title in turn and collect the results in a list
from time import sleep
years = []
for title in top_movies.title:
    year = get_movie_year(title)
    years.append(year)
    # pause one second between lookups
    sleep(1)

# sanity check: there must be exactly one entry in the list per DataFrame row
assert len(years) == len(top_movies)

# store the collected years as a new 'year' column
top_movies['year'] = years

'''
Bonus content: Updating the DataFrame as part of a loop
Expand Down
30 changes: 28 additions & 2 deletions code/07_web_scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,38 +73,63 @@
'''

# find the 'h2' tag and then print its text
# (bare expression: the text is only displayed when run interactively)
b.find(name='h2').text

# find the 'p' tag with an 'id' value of 'feedback' and then print its text
# (attrs restricts the match to tags carrying the given attribute values)
b.find(name='p', attrs={'id':'feedback'}).text

# find the first 'p' tag and then print the value of the 'id' attribute
# (indexing a tag with an attribute name looks up that attribute)
b.find(name='p')['id']

# show the text of every 'li' tag in the document (all four resources)
results = b.find_all(name='li')
for tag in results:
    # a parenthesized single value prints the same under the print statement
    print(tag.text)

# show only the API resources: locate the 'ul' whose id is 'api' first,
# then collect the 'li' tags nested inside it
api_list = b.find(name='ul', attrs={'id':'api'})
results = api_list.find_all(name='li')
for tag in results:
    print(tag.text)

'''
Scraping the IMDb website
'''

# get the HTML from the Shawshank Redemption page
import requests
r = requests.get('http://www.imdb.com/title/tt0111161/')

# convert HTML into Soup
# NOTE(review): no parser is named here, so the choice is left to BeautifulSoup's
# default -- confirm which parser is expected before relying on exact results
b = BeautifulSoup(r.text)
# Python 2 print statement: dumps the whole parsed document
print b

# run this code if you have encoding errors
# Python 2 only workaround: reload(sys) is what makes sys.setdefaultencoding
# callable again, and the call then switches the default encoding to UTF-8
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# get the title
# three attempts, each narrowing the search further than the one before
b.find_all(name='span', attrs={'class':'itemprop', 'itemprop':'name'}) # too many results
b.find(name='span', attrs={'class':'itemprop', 'itemprop':'name'}).text # just get the first
b.find(name='h1').find(name='span', attrs={'class':'itemprop', 'itemprop':'name'}).text # limit the search

# get the star rating (as a float)
# two alternative lookups; each converts the text of the matched tag with float()
float(b.find(name='span', attrs={'itemprop':'ratingValue'}).text)
float(b.find(name='div', attrs={'class':'titlePageSprite star-box-giga-star'}).text)

'''
EXERCISE TWO
'''

# get the description
# strip() removes the whitespace surrounding the paragraph text
b.find(name='p', attrs={'itemprop':'description'}).text.strip()

# get the content rating
# the value is read from the 'content' attribute of the meta tag, not its text
b.find(name='meta', attrs={'itemprop':'contentRating'})['content']

# get the duration in minutes (as an integer)
# first form: drop the last four characters of the tag text before int()
#   (presumably a ' min' suffix -- confirm against the page)
# second form: drop the first two characters and the last one of the 'datetime'
#   attribute (presumably a value shaped like 'PT142M' -- confirm)
int(b.find(name='time', attrs={'itemprop':'duration'}).text.strip()[:-4])
int(b.find(name='time', attrs={'itemprop':'duration'})['datetime'][2:-1])

'''
OPTIONAL WEB SCRAPING HOMEWORK
Expand All @@ -113,7 +138,7 @@
movie information: title, star_rating, description, content_rating, duration.
The function should gather this information by scraping the IMDb website, not
by calling the OMDb API. (This is really just a wrapper of the web scraping
code we wrote above.)
For example, get_movie_info('tt0111161') should return:
Expand All @@ -131,6 +156,7 @@




'''
Another IMDb example: Getting the genres
'''
Expand Down

0 comments on commit bde11cf

Please sign in to comment.