update class 7 code files

Jennytang1224 · Sep 9, 2015 · bde11cf · bde11cf
1 parent 310f0cf
commit bde11cf
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 2 deletions.
diff --git a/code/07_api.py b/code/07_api.py
@@ -14,30 +14,59 @@
 '''
 
 # read IMDb data into a DataFrame: we want a year column!
+import pandas as pd
+movies = pd.read_csv('imdb_1000.csv')    # relative path: resolved against the current working directory
+movies.head()    # bare expression: only displays output in an interactive session
 
 # use requests library to interact with a URL
+import requests
+r = requests.get('http://www.omdbapi.com/?t=the shawshank redemption&r=json&type=movie')    # query params: t, r, type (presumably title, response format, result type -- confirm against the OMDb docs)
 
 # check the status: 200 means success, 4xx means error
+r.status_code
 
 # view the raw response text
+r.text
 
 # decode the JSON response body into a dictionary
+r.json()
 
 # extracting the year from the dictionary
+r.json()['Year']
 
 # what happens if the movie name is not recognized?
+r = requests.get('http://www.omdbapi.com/?t=blahblahblah&r=json&type=movie')
+r.status_code    # check whether the HTTP status alone reveals the failed lookup
+r.json()    # inspect the body; get_movie_year below keys off its 'Response' field
 
 # define a function to return the year
+def get_movie_year(title):
+    r = requests.get('http://www.omdbapi.com/?t=' + title + '&r=json&type=movie')
+    info = r.json()
+    if info['Response'] == 'True':
+        return int(info['Year'])
+    else:
+        return None
 
 # test the function
+get_movie_year('The Shawshank Redemption')    # recognized title: expect an int year
+get_movie_year('blahblahblah')    # presumably unrecognized by the API: expect None
 
 # create a smaller DataFrame for testing
+top_movies = movies.head().copy()    # copy so the column added later does not touch `movies` (presumably also avoids pandas' SettingWithCopyWarning)
 
 # write a for loop to build a list of years
+from time import sleep
+years = []
+for title in top_movies.title:
+    years.append(get_movie_year(title))    # None is appended when a lookup fails, so positions stay aligned with the rows
+    sleep(1)    # wait one second between requests (presumably to go easy on the API)
 
 # check that the DataFrame and the list of years are the same length
+assert(len(top_movies) == len(years))    # parentheses are redundant (assert is a statement); the check is skipped under python -O
 
 # save that list as a new column
+top_movies['year'] = years
 
 '''
 Bonus content: Updating the DataFrame as part of a loop

diff --git a/code/07_web_scraping.py b/code/07_web_scraping.py
@@ -73,38 +73,63 @@
 '''
 
 # find the 'h2' tag and then print its text
+b.find(name='h2').text
 
 # find the 'p' tag with an 'id' value of 'feedback' and then print its text
+b.find(name='p', attrs={'id':'feedback'}).text
 
 # find the first 'p' tag and then print the value of the 'id' attribute
+b.find(name='p')['id']
 
 # print the text of all four resources
+results = b.find_all(name='li')    # every 'li' tag anywhere in the document
+for tag in results:
+    print tag.text    # Python 2 print statement
 
 # print the text of only the API resources
+results = b.find(name='ul', attrs={'id':'api'}).find_all(name='li')    # narrow to the 'ul' with id 'api' first, then search only inside it
+for tag in results:
+    print tag.text    # Python 2 print statement
 
 '''
 Scraping the IMDb website
 '''
 
 # get the HTML from the Shawshank Redemption page
+import requests
+r = requests.get('http://www.imdb.com/title/tt0111161/')
 
 # convert HTML into Soup
+b = BeautifulSoup(r.text)    # no parser named, so bs4 picks whichever is installed; the parsed tree can differ between machines
+print b    # Python 2 print statement; dumps the whole parsed document
 
 # run this code if you have encoding errors
+import sys
+reload(sys)    # Python 2 only: restores sys.setdefaultencoding, which the site module removes at startup
+sys.setdefaultencoding('utf8')    # process-wide change to the implicit str/unicode conversion codec
 
 # get the title
+b.find_all(name='span', attrs={'class':'itemprop', 'itemprop':'name'})    # too many results
+b.find(name='span', attrs={'class':'itemprop', 'itemprop':'name'}).text   # just get the first
+b.find(name='h1').find(name='span', attrs={'class':'itemprop', 'itemprop':'name'}).text   # limit the search
 
-# get the star rating
+# get the star rating (as a float)
+float(b.find(name='span', attrs={'itemprop':'ratingValue'}).text)    # find() returns None when nothing matches, so .text would raise AttributeError
+float(b.find(name='div', attrs={'class':'titlePageSprite star-box-giga-star'}).text)    # presumably an alternative selector for the same value
 
 '''
 EXERCISE TWO
 '''
 
 # get the description
+b.find(name='p', attrs={'itemprop':'description'}).text.strip()
 
 # get the content rating
+b.find(name='meta', attrs={'itemprop':'contentRating'})['content']
 
 # get the duration in minutes (as an integer)
+int(b.find(name='time', attrs={'itemprop':'duration'}).text.strip()[:-4])    # drop the last 4 characters (presumably a ' min' suffix)
+int(b.find(name='time', attrs={'itemprop':'duration'})['datetime'][2:-1])    # drop the first 2 and last 1 characters (presumably an ISO 8601 'PT...M' wrapper)
 
 '''
 OPTIONAL WEB SCRAPING HOMEWORK
@@ -113,7 +138,7 @@
 movie information: title, star_rating, description, content_rating, duration.
 The function should gather this information by scraping the IMDb website, not
 by calling the OMDb API. (This is really just a wrapper of the web scraping
-code we wrote above.)
+code we wrote above)
 
 For example, get_movie_info('tt0111161') should return:
 
@@ -131,6 +156,7 @@
 
 
 
+
 '''
 Another IMDb example: Getting the genres
 '''