# https://deeplearningcourses.com/c/support-vector-machines-in-python
# https://www.udemy.com/support-vector-machines-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
from datetime import datetime
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.utils import shuffle
from util import getKaggleMNIST
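
# The idea behind this "fake" neural net: pick M reference points, project
# every input onto them, and squash with tanh. A linear classifier trained
# on top of these fixed features then behaves like a one-hidden-layer
# network whose first layer is never trained (similar in spirit to random
# features / extreme learning machines).
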
class SigmoidFeaturizer:
    def __init__(self, gamma=1.0, n_components=100, method='random'):
        self.M = n_components
        self.gamma = gamma
        assert method in ('normal', 'random', 'kmeans', 'gmm')
        self.method = method

    def _subsample_data(self, X, Y, n=10000):
        # fit on a random subset so k-means / GMM stay fast
        if Y is not None:
            X, Y = shuffle(X, Y)
            return X[:n], Y[:n]
        else:
            X = shuffle(X)
            return X[:n]

    def fit(self, X, Y=None):
        if self.method == 'random':
            # use M randomly chosen training points as the reference points
            N = len(X)
            idx = np.random.randint(N, size=self.M)
            self.samples = X[idx]
        elif self.method == 'normal':
            # sample from N(0, 1), scaled by 1/sqrt(D) so the dot products
            # in transform() stay O(1)
            D = X.shape[1]
            self.samples = np.random.randn(self.M, D) / np.sqrt(D)
        elif self.method == 'kmeans':
            # requires labels, since the number of clusters is len(set(Y))
            X, Y = self._subsample_data(X, Y)

            print("Fitting kmeans...")
            t0 = datetime.now()
            kmeans = KMeans(n_clusters=len(set(Y)))
            kmeans.fit(X)
            print("Finished fitting kmeans, duration:", datetime.now() - t0)

            # pick the most ambiguous points: compute the distance from each
            # point to all cluster centers, and keep the points whose
            # distances have the smallest variance (i.e. points roughly
            # equidistant from every center)
            dists = kmeans.transform(X)  # N x K matrix of distances
            variances = dists.var(axis=1)
            idx = np.argsort(variances)  # smallest to largest
            idx = idx[:self.M]
            self.samples = X[idx]
        elif self.method == 'gmm':
            # also requires labels, for the number of mixture components
            X, Y = self._subsample_data(X, Y)

            print("Fitting GMM...")
            t0 = datetime.now()
            gmm = GaussianMixture(
                n_components=len(set(Y)),
                covariance_type='spherical',
                reg_covar=1e-6)
            gmm.fit(X)
            print("Finished fitting GMM, duration:", datetime.now() - t0)

            # pick the most ambiguous points: high entropy of the posterior
            # over components means no single component claims the point
            probs = gmm.predict_proba(X)
            ent = stats.entropy(probs.T)  # N-length vector of entropies
            idx = np.argsort(-ent)  # negate since we want biggest first
            idx = idx[:self.M]
            self.samples = X[idx]
        return self

    def transform(self, X):
        # project onto the reference points, then squash with tanh:
        # (Ntest x D) . (D x Nsamples) -> (Ntest x Nsamples)
        Z = X.dot(self.samples.T)
        return np.tanh(self.gamma * Z)
        # ReLU alternative:
        # return self.gamma * Z * (Z > 0)

    def fit_transform(self, X, Y=None):
        return self.fit(X, Y).transform(X)
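
# A minimal, commented-out sanity check (not in the original script), using
# synthetic data just to confirm the featurizer's output shape:
# X_demo = np.random.randn(50, 784)
# featurizer = SigmoidFeaturizer(gamma=0.05, n_components=10, method='normal')
# print(featurizer.fit_transform(X_demo).shape)  # expected: (50, 10)
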
# get the data: https://www.kaggle.com/c/digit-recognizer
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
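# (getKaggleMNIST comes from util.py in this repo; it loads the Kaggle
# digit-recognizer CSV from the URL above and returns a train/test split.)
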
# with SGD
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('sigmoid', SigmoidFeaturizer(gamma=0.05, n_components=2000, method='normal')),
    # ('linear', SGDClassifier(max_iter=1000000, tol=1e-5)),
    ('linear', LogisticRegression()),  # takes longer than SGDClassifier
])
# with Linear SVC
# n_components = 3000
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('sigmoid', SigmoidFeaturizer(n_components=n_components)),
#     ('linear', LinearSVC()),
# ])
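
# For comparison, sklearn ships a similar fixed-random-features trick:
# random Fourier features approximating the RBF kernel. A rough sketch,
# with gamma and n_components chosen arbitrarily (not tuned):
# from sklearn.kernel_approximation import RBFSampler
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('rbf', RBFSampler(gamma=0.05, n_components=2000)),
#     ('linear', LogisticRegression()),
# ])
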
# let's do some cross-validation instead, why not
X = np.vstack((Xtrain, Xtest))
Y = np.concatenate((Ytrain, Ytest))
scores = cross_val_score(pipeline, X, Y, cv=5)
print(scores)
print("avg:", np.mean(scores))
# t0 = datetime.now()
# pipeline.fit(Xtrain, Ytrain)
# print("train duration:", datetime.now() - t0)
# t0 = datetime.now()
# print("train score:", pipeline.score(Xtrain, Ytrain), "duration:", datetime.now() - t0)
# t0 = datetime.now()
# print("test score:", pipeline.score(Xtest, Ytest), "duration:", datetime.now() - t0)