
add class 5 and 6 homework solutions
justmarkham committed Sep 8, 2015
1 parent 259c830 commit 2015af5
Showing 3 changed files with 192 additions and 2 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -203,8 +203,8 @@ Tuesday | Thursday
-----

### Class 7: Getting Data
- * Pandas homework with the IMDb data due (solution)
- * Optional "human learning" exercise with the iris data due (solution)
+ * Pandas homework with the IMDb data due ([solution](code/05_pandas_homework_imdb.py))
+ * Optional "human learning" exercise with the iris data due ([solution](http://nbviewer.ipython.org/github/justmarkham/DAT8/blob/master/notebooks/06_human_learning_iris.ipynb))
* APIs ([code](code/07_api.py))
* [OMDb API](http://www.omdbapi.com/)
* Web scraping ([code](code/07_web_scraping.py))
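
A minimal sketch of calling the OMDb API with `requests` (an editor illustration, not the class code; the `apikey` parameter and `YOUR_KEY` placeholder are assumptions, since the current API requires a free key):

```python
import requests

# request a single movie by title; the response is JSON with fields like Title, Year, imdbRating
r = requests.get('http://www.omdbapi.com/', params={'t': 'Toy Story', 'apikey': 'YOUR_KEY'})
print(r.json().get('Year'))
```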
44 changes: 44 additions & 0 deletions code/05_pandas_homework_imdb.py
@@ -10,54 +10,98 @@
import pandas as pd
import matplotlib.pyplot as plt

# read in 'imdb_1000.csv' and store it in a DataFrame named movies
movies = pd.read_csv('imdb_1000.csv')

# check the number of rows and columns
movies.shape

# check the data type of each column
movies.dtypes

# calculate the average movie duration
movies.duration.mean()

# sort the DataFrame by duration to find the shortest and longest movies
movies.sort_values('duration').head(1)
movies.sort_values('duration').tail(1)

# create a histogram of duration, choosing an "appropriate" number of bins
movies.duration.plot(kind='hist', bins=20)

# use a box plot to display that same data
movies.duration.plot(kind='box')
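
# a possible numeric complement (not in the original solution): describe()
# prints the same five-number summary that the box plot draws
movies.duration.describe()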

'''
INTERMEDIATE LEVEL
'''

# count how many movies have each of the content ratings
movies.content_rating.value_counts()

# use a visualization to display that same data, including a title and x and y labels
movies.content_rating.value_counts().plot(kind='bar', title='Top 1000 Movies by Content Rating')
plt.xlabel('Content Rating')
plt.ylabel('Number of Movies')

# convert the following content ratings to "UNRATED": NOT RATED, APPROVED, PASSED, GP
movies['content_rating'] = movies.content_rating.replace(['NOT RATED', 'APPROVED', 'PASSED', 'GP'], 'UNRATED')

# convert the following content ratings to "NC-17": X, TV-MA
movies['content_rating'] = movies.content_rating.replace(['X', 'TV-MA'], 'NC-17')

# count the number of missing values in each column
movies.isnull().sum()

# if there are missing values: examine them, then fill them in with "reasonable" values
movies[movies.content_rating.isnull()]
movies['content_rating'] = movies.content_rating.fillna('UNRATED')

# calculate the average star rating for movies 2 hours or longer,
# and compare that with the average star rating for movies shorter than 2 hours
movies[movies.duration >= 120].star_rating.mean()
movies[movies.duration < 120].star_rating.mean()

# use a visualization to detect whether there is a relationship between duration and star rating
movies.plot(kind='scatter', x='duration', y='star_rating', alpha=0.2)
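
# a possible numeric complement (not in the original solution): the Pearson
# correlation quantifies the linear relationship shown in the scatter plot
movies.duration.corr(movies.star_rating)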

# calculate the average duration for each genre
movies.groupby('genre').duration.mean()

'''
ADVANCED LEVEL
'''

# visualize the relationship between content rating and duration
movies.boxplot(column='duration', by='content_rating')
movies.hist(column='duration', by='content_rating', sharex=True)

# determine the top rated movie (by star rating) for each genre
movies.sort_values('star_rating', ascending=False).groupby('genre').title.first()
movies.groupby('genre').title.first() # equivalent, since DataFrame is already sorted by star rating

# check if there are multiple movies with the same title, and if so, determine if they are actually duplicates
dupe_titles = movies[movies.title.duplicated()].title
movies[movies.title.isin(dupe_titles)]
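
# one extra sanity check (not in the original solution): if no entire row is
# duplicated, movies sharing a title are distinct films that reuse a name
movies.duplicated().sum()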

# calculate the average star rating for each genre, but only include genres with at least 10 movies

# option 1: manually create a list of relevant genres, then filter using that list
movies.genre.value_counts()
top_genres = ['Drama', 'Comedy', 'Action', 'Crime', 'Biography', 'Adventure', 'Animation', 'Horror', 'Mystery']
movies[movies.genre.isin(top_genres)].groupby('genre').star_rating.mean()

# option 2: automatically create a list of relevant genres by saving the value_counts and then filtering
genre_counts = movies.genre.value_counts()
top_genres = genre_counts[genre_counts >= 10].index
movies[movies.genre.isin(top_genres)].groupby('genre').star_rating.mean()

# option 3: calculate the average star rating for all genres, then filter using a boolean Series
movies.groupby('genre').star_rating.mean()[movies.genre.value_counts() >= 10]

# option 4: aggregate by count and mean, then filter using the count
genre_ratings = movies.groupby('genre').star_rating.agg(['count', 'mean'])
genre_ratings[genre_ratings['count'] >= 10]
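
# option 5 (an alternative not in the original solution): use groupby().filter()
# to keep only the rows whose genre has at least 10 movies, then group the result
movies.groupby('genre').filter(lambda group: len(group) >= 10).groupby('genre').star_rating.mean()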

'''
BONUS
'''
146 changes: 146 additions & 0 deletions notebooks/06_human_learning_iris.ipynb
@@ -1506,6 +1506,152 @@
"\n",
"Define a function that accepts a row of data and returns a predicted species. Then, use that function to make predictions for all existing rows of data, and check the accuracy of your predictions."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# given a row of data, return a predicted species_num (0/1/2)\n",
"def classify_iris(row):\n",
"\n",
" # calculate the petal_area\n",
" petal_area = row[2] * row[3]\n",
" \n",
" # predict the species based on the rules above \n",
" if petal_area < 2:\n",
" prediction = 'setosa'\n",
" elif petal_area < 7.4:\n",
" prediction = 'versicolor'\n",
" else:\n",
" prediction = 'virginica'\n",
" \n",
" # map the species name to a numeric value\n",
" species_to_num = {'setosa':0, 'versicolor':1, 'virginica':2}\n",
" \n",
" # return that value\n",
" return species_to_num[prediction]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"sepal_length 5.1\n",
"sepal_width 3.5\n",
"petal_length 1.4\n",
"petal_width 0.2\n",
"species Iris-setosa\n",
"species_num 0\n",
"petal_area 0.28\n",
"Name: 0, dtype: object"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# print the first row\n",
"iris.iloc[0, :]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"sepal_length 5.9\n",
"sepal_width 3\n",
"petal_length 5.1\n",
"petal_width 1.8\n",
"species Iris-virginica\n",
"species_num 2\n",
"petal_area 9.18\n",
"Name: 149, dtype: object"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# print the last row\n",
"iris.iloc[149, :]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"2\n"
]
}
],
"source": [
"# test the function on the first and last rows\n",
"print classify_iris(iris.iloc[0, :])\n",
"print classify_iris(iris.iloc[149, :])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# make predictions for all rows and store them in the DataFrame\n",
"iris['prediction'] = [classify_iris(row) for index, row in iris.iterrows()]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.97333333333333338"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# calculate the percentage of correct predictions\n",
"sum(iris.species_num == iris.prediction) / 150."
]
}
],
"metadata": {
