Commit 4205208
committed Mar 25, 2015

ENH Single file with the code as in book

1 parent 1d8fd23 commit 4205208
2 files changed: +170 -0 lines

ch02/README.rst (+3)

@@ -6,6 +6,9 @@ Support code for *Chapter 2: Learning How to Classify with Real-world
 Examples*. The directory data contains the seeds dataset, originally downloaded
 from https://archive.ics.uci.edu/ml/datasets/seeds
 
+chapter.py
+    The code as printed in the book.
+
 figure1.py
     Figure 1 in the book: all 2-by-2 scatter plots

ch02/chapter.py (+167, new file)

@@ -0,0 +1,167 @@
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License


from matplotlib import pyplot as plt
import numpy as np

# We load the data with load_iris from sklearn
from sklearn.datasets import load_iris
data = load_iris()

# load_iris returns an object with several fields
features = data.data
feature_names = data.feature_names
target = data.target
target_names = data.target_names

# Plot the first two features, using a different color and marker for
# each of the three classes
for t in range(3):
    if t == 0:
        c = 'r'
        marker = '>'
    elif t == 1:
        c = 'g'
        marker = 'o'
    elif t == 2:
        c = 'b'
        marker = 'x'
    plt.scatter(features[target == t, 0],
                features[target == t, 1],
                marker=marker,
                c=c)
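# (Note, not in the book's printed code: when running this file as a
# script, a plt.show() call would be needed here to display the plot.)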

# We use NumPy fancy indexing to get an array of strings:
labels = target_names[target]

# The petal length is the feature at position 2
plength = features[:, 2]

# Build an array of booleans:
is_setosa = (labels == 'setosa')

# This is the important step:
max_setosa = plength[is_setosa].max()
min_non_setosa = plength[~is_setosa].min()
print('Maximum of setosa: {0}.'.format(max_setosa))

print('Minimum of others: {0}.'.format(min_non_setosa))
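
# On the Iris data, these print 1.9 and 3.0: any cutoff between the two
# (for example, plength < 2) separates setosa from the other two
# species with no errors.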

# ~ is the boolean negation operator
features = features[~is_setosa]
labels = labels[~is_setosa]
# Build a new target variable, is_virginica
is_virginica = (labels == 'virginica')

# Initialize best_acc to impossibly low value
best_acc = -1.0
for fi in range(features.shape[1]):
    # We are going to test all possible thresholds
    thresh = features[:, fi]
    for t in thresh:

        # Get the vector for feature `fi`
        feature_i = features[:, fi]
        # apply threshold `t`
        pred = (feature_i > t)
        acc = (pred == is_virginica).mean()
        rev_acc = (pred == ~is_virginica).mean()
        if rev_acc > acc:
            reverse = True
            acc = rev_acc
        else:
            reverse = False

        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
            best_reverse = reverse

print(best_fi, best_t, best_reverse, best_acc)


def is_virginica_test(fi, t, reverse, example):
    'Apply threshold model to a new example'
    test = example[fi] > t
    if reverse:
        test = not test
    return test
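

# Example use of is_virginica_test on a single flower (the measurement
# values below are hypothetical, for illustration only):
#
#   example = np.array([6.3, 3.3, 6.0, 2.5])
#   print(is_virginica_test(best_fi, best_t, best_reverse, example))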

# fit_model and predict come from a separate threshold module that is
# not part of this commit (a sketch of it follows this listing):
from threshold import fit_model, predict

# The training accuracy was 96.0%.
# The testing accuracy was 90.0% (N = 50).
correct = 0.0

for ei in range(len(features)):
    # select all but the one at position `ei`:
    training = np.ones(len(features), bool)
    training[ei] = False
    testing = ~training
    model = fit_model(features[training], is_virginica[training])
    predictions = predict(model, features[testing])
    correct += np.sum(predictions == is_virginica[testing])
acc = correct / float(len(features))
print('Accuracy: {0:.1%}'.format(acc))


###########################################
############## SEEDS DATASET ##############
###########################################

# load_dataset comes from the chapter's load module, which is not part
# of this commit (a sketch of it follows this listing):
from load import load_dataset

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]
features, labels = load_dataset('seeds')


from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=1)

from sklearn.cross_validation import KFold

kf = KFold(len(features), n_folds=5, shuffle=True)
means = []
for training, testing in kf:
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))
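
# (Not in the book's printed code: the same 5-fold evaluation can be
# written in one call with cross_val_score from the same
# sklearn.cross_validation module, a sketch:
#
#   from sklearn.cross_validation import cross_val_score
#   scores = cross_val_score(classifier, features, labels, cv=kf)
#   print('Mean accuracy: {:.1%}'.format(scores.mean()))
# )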

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Chain feature scaling and the classifier into a single estimator, so
# that each fold is normalized with statistics computed from its own
# training data only:
classifier = KNeighborsClassifier(n_neighbors=1)
classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])


means = []
for training, testing in kf:
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))
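
chapter.py imports fit_model and predict from a threshold module that this
commit does not include. The sketch below shows what such a module could look
like, built from the threshold search in the listing above; the two function
names come from the import line, but everything else here is an assumption,
not the repository's actual file.

# threshold.py -- hypothetical sketch, not the repository's actual file
import numpy as np

def fit_model(features, labels):
    '''Learn a single-feature threshold model.

    Returns (t, fi, reverse): threshold value, feature index, and
    whether the test must be inverted, mirroring the search loop in
    chapter.py. `labels` is a boolean array.'''
    best_acc = -1.0
    for fi in range(features.shape[1]):
        # Every observed value of feature `fi` is a candidate threshold
        for t in features[:, fi]:
            pred = (features[:, fi] > t)
            acc = (pred == labels).mean()
            rev_acc = (pred == ~labels).mean()
            reverse = rev_acc > acc
            acc = max(acc, rev_acc)
            if acc > best_acc:
                best_acc = acc
                best_t, best_fi, best_reverse = t, fi, reverse
    return best_t, best_fi, best_reverse

def predict(model, features):
    '''Apply a learned threshold model to each row of `features`.'''
    t, fi, reverse = model
    pred = (features[:, fi] > t)
    return ~pred if reverse else pred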

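Likewise, load_dataset is imported from a load module that is not part of this
commit. A minimal sketch, under the assumption that the seeds data is stored
as tab-separated lines (seven numeric features followed by a string label) in
data/seeds.tsv; the file name and format here are assumptions:

# load.py -- hypothetical sketch, not the repository's actual file
import numpy as np

def load_dataset(dataset_name):
    '''Load a dataset by name; returns (features, labels) as NumPy arrays.'''
    data = []
    labels = []
    with open('data/{0}.tsv'.format(dataset_name)) as ifile:
        for line in ifile:
            tokens = line.strip().split('\t')
            data.append([float(tk) for tk in tokens[:-1]])
            labels.append(tokens[-1])
    return np.array(data), np.array(labels)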