-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathstatBytes-python.py
330 lines (226 loc) · 8.43 KB
/
statBytes-python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <headingcell level=2>
# Getting started with Python for statistical computing
# <headingcell level=4>
# What is Python?
# <markdowncell>
# + A general purpose, high-level programming language
# + Design, syntax, and culture promote human readability
# + Interpreted
# + Dynamic typing
# + Automatic memory management
# + Support the object-oriented paradigm
# <headingcell level=4>
# The statistical computing "stack"
# <markdowncell>
# + *IPython* is an incredibly useful tool for interactive computing (like the R command line).
# + Typically, write code in a text editor, then use the *run* command in IPython.
# + Play with individual blocks of code interactively at the IPython prompt.
# + Not all of the following libraries are always necessary.
# + You don't have to import the whole package.
# <codecell>
import numpy as np ## vector and matrix operations
import scipy as sp ## grab-bag of statistical and science tools
import matplotlib.pyplot as plt ## matplotlib - plots
import pandas as pd ## emulates R data frames
import statsmodels.api as sm ## scikits.statsmodels - statistics library
import patsy ## emulates R model formulas
import sklearn as skl ## scikits.learn - machine learning library
from sklearn import mixture as sklmix
# <headingcell level=4>
# Installation
# <markdowncell>
# *Linux (Ubuntu)*
#
# + Python should be included in the OS. To install new packages use
# `(sudo) pip install [package]`.
#
# *Mac*
#
# + Python should be installed already. Consider a bundled installation to get all the packages easier.
#
# *Windows*
#
# + Check out the bundled scientific python distributions.
# + [Enthought](https://www.enthought.com/products/canopy/academic/) (free for academic use)
# + [Anaconda](https://store.continuum.io/) (Continuum Analytics)
# + [Python(x,y)](https://code.google.com/p/pythonxy/)
# <headingcell level=4>
# Generate some random variables
# <codecell>
x = np.linspace(0, 10, 100)
y = np.random.normal(loc=x, scale=1.0)
plt.plot(x, y, 'bo')
# <headingcell level=4>
# Basic linear regression
# <codecell>
ols_model = sm.OLS(y, x)
ols_fit = ols_model.fit()
yhat = ols_fit.predict()
print ols_fit.summary()
plt.plot(x, y, 'bo', x, yhat, 'r-', lw=2)
# <headingcell level=4>
# Open a CSV file with Pandas
# <codecell>
iris = pd.read_csv('iris.csv')
print iris, '\n'
print iris.head(10)
# <headingcell level=4>
# Compute summary statistics
# <codecell>
print np.round(iris.describe(), 2)
# <codecell>
type_grp = iris.groupby('Type')
print "number of groups:", type_grp.ngroups, "\n"
print "maximums:\n", type_grp.max(), "\n"
print "size of each group:\n", type_grp.size(), "\n"
# <headingcell level=4>
# Plot the data
# <codecell>
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(iris['PW'], iris['PL'], iris['SW'], c=iris['Type'])
ax.set_xlabel('PW')
ax.set_ylabel('PL')
ax.set_zlabel('SW')
ax.azim = 140
ax.elev = 15
# <headingcell level=4>
# Check the state of the namespace
# <codecell>
whos
# <headingcell level=4>
# Get help for a function
# <codecell>
# %pdoc sklmix.GMM
# %psource sklmix.GMM
#sklmix.GMM?
#sklmix.GMM??
# <headingcell level=4>
# Pick your favorite clustering algorithm
# <codecell>
gmm_model = sklmix.GMM(n_components=3, covariance_type='full')
gmm_model.fit(iris[['PW', 'PL', 'SW']])
yhat = gmm_model.predict(iris[['PW', 'PL', 'SW']])
crosstab = pd.crosstab(iris['Type'], yhat, rownames=['true'], colnames=['predicted'])
print crosstab
# <headingcell level=4>
# Align the confusion matrix with a non-standard package
# <codecell>
import munkres
import sys
m = munkres.Munkres()
cost = munkres.make_cost_matrix(crosstab.values.tolist(), lambda x : sys.maxint - x)
align = m.compute(cost)
print align, '\n'
permute = [x[1] for x in align]
new_label = np.argsort(permute)
yhat_new = new_label[yhat]
print pd.crosstab(iris['Type'], yhat_new, rownames=['true'], colnames=['predicted'])
# <headingcell level=4>
# Bridging the gap with Rpy2
# <codecell>
from rpy2.robjects import r
from rpy2.robjects.numpy2ri import numpy2ri as np2r
Xr = np2r(iris[['PW', 'PL', 'SW']].values)
d = r.dist(Xr)
tree = r.hclust(d, method='ward')
yhat_hclust = r.cutree(tree, k=3)
print pd.crosstab(iris['Type'], yhat_hclust, rownames=['true'], colnames=['predicted'])
# <headingcell level=4>
# Using non-base packages in Rpy2
# <codecell>
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
r = robjects.r
e1071 = importr('e1071')
Yr = np2r(iris['Type'])
Yr = r.factor(Yr)
svm = e1071.svm(Xr, Yr)
yhat = r.predict(svm, Xr)
print r.table(yhat, Yr)
# <headingcell level=4>
# ggplot2 in python with Rpy2
# <markdowncell>
# Thanks to [Fei Yu](http://www.thefeiyu.com/) for this vignette.
# <codecell>
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
r = robjects.r
r.library("ggplot2")
rnorm = r["rnorm"]
dtf = robjects.DataFrame({'x': rnorm(300, mean=0) + rnorm(100, mean=3),
'y': rnorm(300, mean=0) + rnorm(100, mean=3)})
robjects.globalenv['dtf'] = dtf # assign to R's environment
r("gp = ggplot(dtf, aes(x, y))")
r("pp = gp + geom_point(size=0.6)")
# r("plot(pp)") # I can't get this to work on my system, but saving the plot is just as good.
r("ggsave(filename='test.png', plot=pp, scale=0.3)")
from IPython.core.display import Image
Image(filename='test.png')
# <headingcell level=4>
# Python resources
# <markdowncell>
# + [Python tutorial](http://docs.python.org/2/tutorial/)
# + [Dive into Python](http://www.diveintopython.net/toc/index.html) (old but good)
# + [Ipython](http://ipython.org/)
# + [Ipython video](https://www.youtube.com/watch?v=2G5YTlheCbw)
# + [py2exe](http://www.py2exe.org/): compile python to a Windows executable file
# <headingcell level=4>
# Package websites
# <markdowncell>
# + [NumPy](http://www.numpy.org/)
# + [SciPy](http://www.scipy.org/)
# + [matplotlib](http://matplotlib.org/)
# + [pandas](http://pandas.pydata.org/)
# + [scikit-learn](http://scikit-learn.org/stable/)
# + [statsmodels](http://statsmodels.sourceforge.net/)
# + [patsy](http://patsy.readthedocs.org/en/latest/index.html): symbolic formulas for models
# <headingcell level=4>
# Rosetta stones
# <markdowncell>
# + [NumPy for MATLAB users, source 1](http://www.scipy.org/NumPy_for_Matlab_Users)
# + [NumPy for MATLAB users, source 2](http://mathesaurus.sourceforge.net/matlab-numpy.html)
# + [NumPy for R users](http://mathesaurus.sourceforge.net/r-numpy.html)
# + [MATLAB vs. R vs. Python](http://hyperpolyglot.org/numerical-analysis)
# <headingcell level=4>
# Miscellaneous
# <markdowncell>
# + [Fernando Perez](https://twitter.com/fperez_org) (IPython)
# + [Wes McKinney](https://twitter.com/wesmckinn) (pandas)
# + [PyData conference videos](http://pyvideo.org/category/18/pydata)
# + [SciPy 2012 conference videos](http://pyvideo.org/category/20/scipy_2012)
# + [Python for Data Analysis](http://www.amazon.com/books/dp/1449319793) (Wes McKinney, 2012)
# + [http://slendrmeans.wordpress.com/](http://slendrmeans.wordpress.com/)
# <headingcell level=4>
# Why Python?
# <markdowncell>
# + Somebody in power told you to.
# + Communication with other people.
# + Communication between parts of a data analysis pipeline.
# + Faster than R (often, but not always).
# + Simple syntax for object-oriented designs.
# + Written by computer scientists and programmers (not statisticians).
# + Much bigger ecosystem of general language tools (but smaller set of statistics tools).
# + Production-quality code.
# <headingcell level=4>
# Ipython Notebook
# <markdowncell>
# Relevant links:
#
# + http://ipython.org/notebook.html
# + https://github.com/ipython/nbconvert
# + http://nbviewer.ipython.org/
#
# For Ubuntu:
#
# + `sudo apt-get install ipython-notebook`
# + `ipython notebook --pylab=inline --script`
# + `nbconvert latex statBytes-python`
# + `pdflatex statBytes-python.tex`
#
# The `--pylab==inline` flag embeds matplotlib output in the notebook rather than popping up a new window for each figure. The `--script` flat saves a `.py` script along with the `.ipynb` file which is ready to be run in a regular python or ipython prompt.
#
# There are (at least) two ways to post a notebook online. For this notebook I pushed the .ipynb file to a public github repo, open the file on the github website, clicked the "raw" button, and pasted the URL into the box at the top of the nbviewer website. Pretty straightforward.