-
Notifications
You must be signed in to change notification settings - Fork 1.4k
/
Copy pathseeds_knn_sklearn.py
90 lines (71 loc) · 2.97 KB
/
seeds_knn_sklearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
# Basic imports
from __future__ import print_function
import numpy as np
from load import load_dataset
# Import sklearn implementation of KNN
from sklearn.neighbors import KNeighborsClassifier
features, labels = load_dataset('seeds')
classifier = KNeighborsClassifier(n_neighbors=4)
n = len(features)
correct = 0.0
for ei in range(n):
training = np.ones(n, bool)
training[ei] = 0
testing = ~training
classifier.fit(features[training], labels[training])
pred = classifier.predict(features[ei])
correct += (pred == labels[ei])
print('Result of leave-one-out: {}'.format(correct/n))
# Import KFold object
from sklearn.cross_validation import KFold
# means will hold the mean for each fold
means = []
# kf is a generator of pairs (training,testing) so that each iteration
# implements a separate fold.
kf = KFold(len(features), n_folds=3, shuffle=True)
for training,testing in kf:
# We learn a model for this fold with `fit` and then apply it to the
# testing data with `predict`:
classifier.fit(features[training], labels[training])
prediction = classifier.predict(features[testing])
# np.mean on an array of booleans returns the fraction of correct decisions
# for this fold:
curmean = np.mean(prediction == labels[testing])
means.append(curmean)
print('Result of cross-validation using KFold: {}'.format(means))
# The function cross_val_score does the same thing as the loop above with a
# single function call
from sklearn.cross_validation import cross_val_score
crossed = cross_val_score(classifier, features, labels)
print('Result of cross-validation using cross_val_score: {}'.format(crossed))
# The results above use the features as is, which we learned was not optimal
# except if the features happen to all be in the same scale. We can pre-scale
# the features as explained in the main text:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])
crossed = cross_val_score(classifier, features, labels)
print('Result with prescaling: {}'.format(crossed))
# Now, generate & print a cross-validated confusion matrix for the same result
from sklearn.metrics import confusion_matrix
names = list(set(labels))
labels = np.array([names.index(ell) for ell in labels])
preds = labels.copy()
preds[:] = -1
for train, test in kf:
classifier.fit(features[train], labels[train])
preds[test] = classifier.predict(features[test])
cmat = confusion_matrix(labels, preds)
print()
print('Confusion matrix: [rows represent true outcome, columns predicted outcome]')
print(cmat)
# The explicit float() conversion is necessary in Python 2
# (Otherwise, result is rounded to 0)
acc = cmat.trace()/float(cmat.sum())
print('Accuracy: {0:.1%}'.format(acc))