
add class 5 and 6 homework solutions
justmarkham committed Sep 8, 2015
1 parent 259c830 commit 2015af5
Showing 3 changed files with 192 additions and 2 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -203,8 +203,8 @@ Tuesday | Thursday
-----

### Class 7: Getting Data
- * Pandas homework with the IMDb data due (solution)
- * Optional "human learning" exercise with the iris data due (solution)
+ * Pandas homework with the IMDb data due ([solution](code/05_pandas_homework_imdb.py))
+ * Optional "human learning" exercise with the iris data due ([solution](http://nbviewer.ipython.org/github/justmarkham/DAT8/blob/master/notebooks/06_human_learning_iris.ipynb))
* APIs ([code](code/07_api.py))
* [OMDb API](http://www.omdbapi.com/)
* Web scraping ([code](code/07_web_scraping.py))
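
A minimal sketch of calling the OMDb API with `requests` (an editor illustration, not the class code; the `apikey` parameter and `YOUR_KEY` placeholder are assumptions, since the current API requires a free key):

```python
import requests

# request a single movie by title; the response is JSON with fields like Title, Year, imdbRating
r = requests.get('http://www.omdbapi.com/', params={'t': 'Toy Story', 'apikey': 'YOUR_KEY'})
print(r.json().get('Year'))
```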
44 changes: 44 additions & 0 deletions code/05_pandas_homework_imdb.py
@@ -10,54 +10,98 @@
import pandas as pd
import matplotlib.pyplot as plt

# read in 'imdb_1000.csv' and store it in a DataFrame named movies
movies = pd.read_csv('imdb_1000.csv')

# check the number of rows and columns
movies.shape

# check the data type of each column
movies.dtypes

# calculate the average movie duration
movies.duration.mean()

# sort the DataFrame by duration to find the shortest and longest movies
movies.sort_values('duration').head(1)
movies.sort_values('duration').tail(1)

# create a histogram of duration, choosing an "appropriate" number of bins
movies.duration.plot(kind='hist', bins=20)

# use a box plot to display that same data
movies.duration.plot(kind='box')
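
# a possible numeric complement (not in the original solution): describe()
# prints the same five-number summary that the box plot draws
movies.duration.describe()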

'''
INTERMEDIATE LEVEL
'''

# count how many movies have each of the content ratings
movies.content_rating.value_counts()

# use a visualization to display that same data, including a title and x and y labels
movies.content_rating.value_counts().plot(kind='bar', title='Top 1000 Movies by Content Rating')
plt.xlabel('Content Rating')
plt.ylabel('Number of Movies')

# convert the following content ratings to "UNRATED": NOT RATED, APPROVED, PASSED, GP
movies['content_rating'] = movies.content_rating.replace(['NOT RATED', 'APPROVED', 'PASSED', 'GP'], 'UNRATED')

# convert the following content ratings to "NC-17": X, TV-MA
movies['content_rating'] = movies.content_rating.replace(['X', 'TV-MA'], 'NC-17')

# count the number of missing values in each column
movies.isnull().sum()

# if there are missing values: examine them, then fill them in with "reasonable" values
movies[movies.content_rating.isnull()]
movies['content_rating'] = movies.content_rating.fillna('UNRATED')

# calculate the average star rating for movies 2 hours or longer,
# and compare that with the average star rating for movies shorter than 2 hours
movies[movies.duration >= 120].star_rating.mean()
movies[movies.duration < 120].star_rating.mean()

# use a visualization to detect whether there is a relationship between duration and star rating
movies.plot(kind='scatter', x='duration', y='star_rating', alpha=0.2)
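
# a possible numeric complement (not in the original solution): the Pearson
# correlation quantifies the linear relationship shown in the scatter plot
movies.duration.corr(movies.star_rating)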

# calculate the average duration for each genre
movies.groupby('genre').duration.mean()

'''
ADVANCED LEVEL
'''

# visualize the relationship between content rating and duration
movies.boxplot(column='duration', by='content_rating')
movies.hist(column='duration', by='content_rating', sharex=True)

# determine the top rated movie (by star rating) for each genre
movies.sort_values('star_rating', ascending=False).groupby('genre').title.first()
movies.groupby('genre').title.first() # equivalent, since DataFrame is already sorted by star rating

# check if there are multiple movies with the same title, and if so, determine if they are actually duplicates
dupe_titles = movies[movies.title.duplicated()].title
movies[movies.title.isin(dupe_titles)]
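
# one extra sanity check (not in the original solution): if no entire row is
# duplicated, movies sharing a title are distinct films that reuse a name
movies.duplicated().sum()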

# calculate the average star rating for each genre, but only include genres with at least 10 movies

# option 1: manually create a list of relevant genres, then filter using that list
movies.genre.value_counts()
top_genres = ['Drama', 'Comedy', 'Action', 'Crime', 'Biography', 'Adventure', 'Animation', 'Horror', 'Mystery']
movies[movies.genre.isin(top_genres)].groupby('genre').star_rating.mean()

# option 2: automatically create a list of relevant genres by saving the value_counts and then filtering
genre_counts = movies.genre.value_counts()
top_genres = genre_counts[genre_counts >= 10].index
movies[movies.genre.isin(top_genres)].groupby('genre').star_rating.mean()

# option 3: calculate the average star rating for all genres, then filter using a boolean Series
movies.groupby('genre').star_rating.mean()[movies.genre.value_counts() >= 10]

# option 4: aggregate by count and mean, then filter using the count
genre_ratings = movies.groupby('genre').star_rating.agg(['count', 'mean'])
genre_ratings[genre_ratings['count'] >= 10]
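
# option 5 (an alternative not in the original solution): use groupby().filter()
# to keep only the rows whose genre has at least 10 movies, then group the result
movies.groupby('genre').filter(lambda group: len(group) >= 10).groupby('genre').star_rating.mean()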

'''
BONUS
'''
146 changes: 146 additions & 0 deletions notebooks/06_human_learning_iris.ipynb
@@ -1506,6 +1506,152 @@
"\n",
"Define a function that accepts a row of data and returns a predicted species. Then, use that function to make predictions for all existing rows of data, and check the accuracy of your predictions."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# given a row of data, return a predicted species_num (0/1/2)\n",
"def classify_iris(row):\n",
"\n",
" # calculate the petal_area\n",
" petal_area = row[2] * row[3]\n",
" \n",
" # predict the species based on the rules above \n",
" if petal_area < 2:\n",
" prediction = 'setosa'\n",
" elif petal_area < 7.4:\n",
" prediction = 'versicolor'\n",
" else:\n",
" prediction = 'virginica'\n",
" \n",
" # map the species name to a numeric value\n",
" species_to_num = {'setosa':0, 'versicolor':1, 'virginica':2}\n",
" \n",
" # return that value\n",
" return species_to_num[prediction]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"sepal_length 5.1\n",
"sepal_width 3.5\n",
"petal_length 1.4\n",
"petal_width 0.2\n",
"species Iris-setosa\n",
"species_num 0\n",
"petal_area 0.28\n",
"Name: 0, dtype: object"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# print the first row\n",
"iris.iloc[0, :]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"sepal_length 5.9\n",
"sepal_width 3\n",
"petal_length 5.1\n",
"petal_width 1.8\n",
"species Iris-virginica\n",
"species_num 2\n",
"petal_area 9.18\n",
"Name: 149, dtype: object"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# print the last row\n",
"iris.iloc[149, :]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"2\n"
]
}
],
"source": [
"# test the function on the first and last rows\n",
"print classify_iris(iris.iloc[0, :])\n",
"print classify_iris(iris.iloc[149, :])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# make predictions for all rows and store them in the DataFrame\n",
"iris['prediction'] = [classify_iris(row) for index, row in iris.iterrows()]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.97333333333333338"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# calculate the percentage of correct predictions\n",
"sum(iris.species_num == iris.prediction) / 150."
]
}
],
"metadata": {
