-
Notifications
You must be signed in to change notification settings - Fork 58
/
05-pandas.py
150 lines (112 loc) · 4.61 KB
/
05-pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
'''
Pandas Basics: Reading Files, Summarizing, Handling Missing Values, Filtering, Sorting
'''
# read in the CSV file from a URL
drinks = pd.read_csv('drinks.csv')
drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/python-data-analysis-workshop/master/drinks.csv')
type(drinks)
# examine the data
drinks # print the first 30 and last 30 rows
drinks.head() # print the first 5 rows
drinks.tail() # print the last 5 rows
drinks.describe() # describe any numeric columns
drinks.info() # concise summary
# find missing values in a DataFrame
drinks.isnull() # DataFrame of booleans
drinks.isnull().sum() # convert booleans to integers and add
# handling missing values
drinks.dropna() # drop a row if ANY values are missing
drinks.fillna(value='NA') # fill in missing values
# fix the original import
drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/python-data-analysis-workshop/master/drinks.csv', na_filter=False)
drinks.isnull().sum()
# selecting a column ('Series')
drinks['continent']
drinks.continent # equivalent
type(drinks.continent)
# summarizing a non-numeric column
drinks.continent.describe()
drinks.continent.value_counts()
# selecting multiple columns
drinks[['country', 'beer_servings']]
my_cols = ['country', 'beer_servings']
drinks[my_cols]
# add a new column as a function of existing columns
drinks['total_servings'] = drinks.beer_servings + drinks.spirit_servings + drinks.wine_servings
drinks.head()
# logical filtering and sorting
drinks[drinks.continent=='NA']
drinks[['country', 'total_servings']][drinks.continent=='NA']
drinks[['country', 'total_servings']][drinks.continent=='NA'].sort_index(by='total_servings')
drinks[drinks.wine_servings > 300]
drinks[drinks.wine_servings > drinks.beer_servings]
drinks.sort_index(by='beer_servings').tail()
drinks.beer_servings[drinks.continent=='NA'].mean()
drinks.beer_servings[drinks.continent=='EU'].mean()
'''
Split-Apply-Combine
'''
# for each continent, calculate mean beer servings
drinks.groupby('continent').beer_servings.mean()
# for each continent, count number of occurrences
drinks.groupby('continent').continent.count()
drinks.continent.value_counts()
# for each continent, calculate the min, max, and range for total servings
drinks.groupby('continent').total_servings.min()
drinks.groupby('continent').total_servings.max()
drinks.groupby('continent').total_servings.apply(lambda x: x.max() - x.min())
'''
Plotting
'''
# bar plot of number of countries in each continent
drinks.continent.value_counts().plot(kind='bar', title='Countries per Continent')
plt.xlabel('Continent')
plt.ylabel('Count')
plt.show()
# bar plot of average number of beer servings by continent
drinks.groupby('continent').beer_servings.mean().plot(kind='bar')
# histogram of beer servings
drinks.beer_servings.hist(bins=20)
# grouped histogram of beer servings
drinks.beer_servings.hist(by=drinks.continent, sharex=True)
# density plot of beer servings
drinks.beer_servings.plot(kind='density', xlim=(0,500))
# boxplot of beer servings by continent
drinks.boxplot(column='beer_servings', by='continent')
# scatterplot of beer servings versus wine servings
drinks.plot(x='beer_servings', y='wine_servings', kind='scatter', alpha=0.3)
# same scatterplot, except all European countries are colored red
colors = np.where(drinks.continent=='EU', 'r', 'b')
drinks.plot(x='beer_servings', y='wine_servings', kind='scatter', c=colors)
'''
Joining Data
'''
# read 'u.data' into 'ratings'
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_table('u.data', header=None, names=r_cols, sep='\t')
# read 'u.item' into 'movies'
m_cols = ['movie_id', 'title']
movies = pd.read_table('u.item', header=None, names=m_cols, sep='|', usecols=[0,1])
# merge 'movies' and 'ratings' (inner join on 'movie_id')
movies.head()
ratings.head()
movie_ratings = pd.merge(movies, ratings)
movie_ratings.head()
'''
Further Exploration
'''
# for each movie, count number of ratings
movie_ratings.title.value_counts()
# for each movie, calculate mean rating
movie_ratings.groupby('title').rating.mean().order(ascending=False)
# for each movie, count number of ratings and calculate mean rating
movie_ratings.groupby('title').rating.count()
movie_ratings.groupby('title').rating.mean()
movie_stats = movie_ratings.groupby('title').agg({'rating': [np.size, np.mean]})
movie_stats.head() # hierarchical index
# limit results to movies with more than 100 ratings
movie_stats[movie_stats.rating.size > 100].sort_index(by=('rating', 'mean'))