# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License


from matplotlib import pyplot as plt
import numpy as np

# We load the data with load_iris from sklearn
from sklearn.datasets import load_iris
data = load_iris()

# load_iris returns an object with several fields
features = data.data
feature_names = data.feature_names
target = data.target
target_names = data.target_names

# Scatter plot of the first two measurements (sepal length vs. sepal width),
# using one color and marker per species
for t in range(3):
    if t == 0:
        c = 'r'
        marker = '>'
    elif t == 1:
        c = 'g'
        marker = 'o'
    elif t == 2:
        c = 'b'
        marker = 'x'
    plt.scatter(features[target == t, 0],
                features[target == t, 1],
                marker=marker,
                c=c)
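# Label the axes with the names of the two measurements plotted above and
# display the figure:
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])
plt.show()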
# We use NumPy fancy indexing to get an array of strings:
labels = target_names[target]

# The petal length is the feature at position 2
plength = features[:, 2]

# Build an array of booleans:
is_setosa = (labels == 'setosa')

# This is the important step:
max_setosa = plength[is_setosa].max()
min_non_setosa = plength[~is_setosa].min()
print('Maximum of setosa: {0}.'.format(max_setosa))

print('Minimum of others: {0}.'.format(min_non_setosa))
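# Any cut-off between the two values printed above separates setosa from the
# other two species on petal length alone; 2 cm is a convenient choice and
# should classify setosa perfectly on this data:
is_setosa_rule = (plength < 2)
print('Setosa rule accuracy: {0:.1%}'.format(np.mean(is_setosa_rule == is_setosa)))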

# ~ is the boolean negation operator
features = features[~is_setosa]
labels = labels[~is_setosa]
# Build a new target variable, is_virginica
is_virginica = (labels == 'virginica')

# Initialize best_acc to impossibly low value
best_acc = -1.0
for fi in range(features.shape[1]):
    # We are going to test all possible thresholds
    thresh = features[:, fi]
    for t in thresh:

        # Get the vector for feature `fi`
        feature_i = features[:, fi]
        # apply threshold `t`
        pred = (feature_i > t)
        acc = (pred == is_virginica).mean()
        rev_acc = (pred == ~is_virginica).mean()
        if rev_acc > acc:
            reverse = True
            acc = rev_acc
        else:
            reverse = False

        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
            best_reverse = reverse

print(best_fi, best_t, best_reverse, best_acc)
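# For readability, map the winning feature index back to its name
# (feature_names still holds the iris names at this point):
print('Best cut is on {0} at {1:.2f}.'.format(feature_names[best_fi], best_t))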

def is_virginica_test(fi, t, reverse, example):
    'Apply threshold model to a new example'
    test = example[fi] > t
    if reverse:
        test = not test
    return test
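
# For example, apply the learned threshold model to the first of the
# remaining (non-setosa) examples:
print(is_virginica_test(best_fi, best_t, best_reverse, features[0]))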

# threshold.py ships alongside this script in the book's code and provides
# fit_model and predict for the same kind of threshold model as above.
from threshold import fit_model, predict

# Training accuracy was 96.0%.
# Testing accuracy was 90.0% (N = 50).
correct = 0.0

# Leave-one-out cross-validation: hold out each example in turn, train on the
# rest, and test on the held-out example.
for ei in range(len(features)):
    # select all but the one at position `ei`:
    training = np.ones(len(features), bool)
    training[ei] = False
    testing = ~training
    model = fit_model(features[training], is_virginica[training])
    predictions = predict(model, features[testing])
    correct += np.sum(predictions == is_virginica[testing])
acc = correct / float(len(features))
print('Accuracy: {0:.1%}'.format(acc))


###########################################
############## SEEDS DATASET ##############
###########################################

from load import load_dataset

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]
features, labels = load_dataset('seeds')
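
# load_dataset comes from the load.py helper distributed with the book's code;
# it is expected to return the seed measurements as a NumPy array plus one
# string label per example. A quick shape check under that assumption:
print(features.shape, len(labels))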


from sklearn.neighbors import KNeighborsClassifier
# Classify each example by the label of its single nearest training example
classifier = KNeighborsClassifier(n_neighbors=1)
from sklearn.model_selection import KFold

# Five-fold cross-validation over the seeds data
kf = KFold(n_splits=5, shuffle=True)
means = []
for training, testing in kf.split(features):
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns the fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))
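
# The fold loop above can also be written as a one-liner with cross_val_score,
# which performs an equivalent cross-validation internally (by default a
# stratified split for classifiers); a sketch using scikit-learn's
# model_selection module:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, features, labels, cv=5)
print('Mean accuracy (cross_val_score): {:.1%}'.format(scores.mean()))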


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

classifier = KNeighborsClassifier(n_neighbors=1)
classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])
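
# The pipeline first z-scores every feature with StandardScaler (zero mean,
# unit variance) and only then runs the nearest-neighbour classifier, so no
# feature dominates the distance computation merely because of its scale.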


means = []
for training, testing in kf.split(features):
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns the fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))