
Commit

Moved around some of my models.
rpmcruz committed Feb 12, 2017
1 parent d942b9e commit 6ccf625
Showing 20 changed files with 662 additions and 59 deletions.
22 changes: 22 additions & 0 deletions classification/bagging/randomforest.py
@@ -0,0 +1,22 @@
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np


class MyRandomForestClassifier:
    def __init__(self, n_estimators):
        self.n_estimators = n_estimators

    def fit(self, X, y):
        X, y = np.asarray(X), np.asarray(y)
        self.trees = []
        for _ in range(self.n_estimators):
            # bagging: each tree is fit on a bootstrap sample (drawn with
            # replacement); random feature subsets come from max_features
            ix = np.random.choice(len(X), len(X), replace=True)
            self.trees.append(
                DecisionTreeClassifier(max_features='sqrt').fit(X[ix], y[ix]))
        return self

    def predict(self, X):
        # majority vote: average the trees' 0/1 predictions, threshold at 0.5
        yp = [tree.predict(X) for tree in self.trees]
        return ((np.sum(yp, 0) / len(self.trees)) > 0.5).astype(int)

    def score(self, X, y):
        return accuracy_score(y, self.predict(X))
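For reference, a quick usage sketch of the class above (not part of the commit; the breast-cancer dataset, the import path and the comparison against sklearn's RandomForestClassifier are only illustrative, and assume a binary 0/1 target so the majority-vote threshold applies):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from randomforest import MyRandomForestClassifier  # assumes running next to randomforest.py

X, y = load_breast_cancer(return_X_y=True)  # binary 0/1 target
Xtr, Xts, ytr, yts = train_test_split(X, y, train_size=0.8)

mine = MyRandomForestClassifier(25).fit(Xtr, ytr)
ref = RandomForestClassifier(n_estimators=25).fit(Xtr, ytr)
print('mine: %.3f  sklearn: %.3f' % (mine.score(Xts, yts), ref.score(Xts, yts)))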
File renamed without changes.
File renamed without changes.
classification/neuralnet/cpp/neuralnet.cpp
@@ -168,61 +168,6 @@ struct NeuralNet
}
};

// RankNet as in Burges et al (2005)
// Uses a neural network for ranking.

struct RankNet : NeuralNet
{
RankNet(int hidden_nodes) : NeuralNet(hidden_nodes, false) {
}

virtual void fit(mat X, const ivec& y, int maxit) {
build(X.n_cols, hidden_nodes);
X = fitnorm(X);

/* int n1 = 0;
for(unsigned int i = 0; i < y.n_elem; i++)
if(y[i] == 1)
n1++;
unsigned int _maxit = maxit/n1;
cout << "maxit: " << _maxit << endl;*/

for(int t = 0; t < maxit; t++) {
double errors = 0;
for(unsigned int i = 0; i < X.n_rows; i++)
for(unsigned int j = 0; j < X.n_rows; j++) {
if(i == j)
continue;
#if 1 // ignore same -- they only affect the threshold, not the learning
if(y[i] == y[j])
continue;
#endif
double P = (y[i] - y[j] + 1)/2.;

mat* l1 = fprop(X.row(i));
mat* l2 = fprop(X.row(j));

double s = l1[2][0] - l2[2][0];
double C = exp(s)/(exp(s)+1) - P;

backprop(C, -1, l1[0], l1[1], l1[2]);
backprop(C, +1, l2[0], l2[1], l2[2]);
errors += abs(C);
delete [] l1;
delete [] l2;
}
if(t % 100 == 0)
cout << t << " - " << errors << endl;
if(errors/(X.n_rows*(X.n_rows-1)) < 0.01)
break;
}

vec H(X.n_rows);
scores(X, false, H);
th = choose_threshold(H, y);
}
};

//** wrapper

extern "C" {
@@ -247,8 +192,4 @@ extern "C" {
vec _S((double*) S, N, false, true);
nn->scores(_X, true, _S);
}

NeuralNet* RankNet_new(int hidden_nodes, bool balanced) {
return new RankNet(hidden_nodes);
}
}
64 changes: 64 additions & 0 deletions classification/neuralnet/cpp/ranknet.cpp
@@ -0,0 +1,64 @@
#include "neuralnet.cpp"

// RankNet as in Burges et al (2005)
// Uses a neural network for ranking.

struct RankNet : NeuralNet
{
RankNet(int hidden_nodes) : NeuralNet(hidden_nodes, false) {
}

virtual void fit(mat X, const ivec& y, int maxit) {
build(X.n_cols, hidden_nodes);
X = fitnorm(X);

/* int n1 = 0;
for(unsigned int i = 0; i < y.n_elem; i++)
if(y[i] == 1)
n1++;
unsigned int _maxit = maxit/n1;
cout << "maxit: " << _maxit << endl;*/

for(int t = 0; t < maxit; t++) {
double errors = 0;
for(unsigned int i = 0; i < X.n_rows; i++)
for(unsigned int j = 0; j < X.n_rows; j++) {
if(i == j)
continue;
#if 1 // ignore same -- they only affect the threshold, not the learning
if(y[i] == y[j])
continue;
#endif
double P = (y[i] - y[j] + 1)/2.;

mat* l1 = fprop(X.row(i));
mat* l2 = fprop(X.row(j));

double s = l1[2][0] - l2[2][0];
				// derivative of the pairwise cross-entropy cost w.r.t. the score
				// difference s: sigma(s) - P, with sigma the logistic function
				double C = exp(s)/(exp(s)+1) - P;

backprop(C, -1, l1[0], l1[1], l1[2]);
backprop(C, +1, l2[0], l2[1], l2[2]);
errors += abs(C);
delete [] l1;
delete [] l2;
}
if(t % 100 == 0)
cout << t << " - " << errors << endl;
if(errors/(X.n_rows*(X.n_rows-1)) < 0.01)
break;
}

vec H(X.n_rows);
scores(X, false, H);
th = choose_threshold(H, y);
}
};

//** wrapper

extern "C" {
NeuralNet* RankNet_new(int hidden_nodes, bool balanced) {
return new RankNet(hidden_nodes);
}
}
65 changes: 65 additions & 0 deletions classification/neuralnet/neuralnet.py
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-

# Wrapper around cpp/neuralnet.cpp

import os
os.system('cpp/compile.sh')
print('compiled neuralnet.cpp -> libneuralnet.so')

import ctypes
lib = ctypes.cdll.LoadLibrary('cpp/libneuralnet.so')

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin


class NeuralNet(BaseEstimator, ClassifierMixin):
def __init__(self, hidden_nodes, balanced, ranker=False, maxit=1000):
self.obj = None
self.hidden_nodes = hidden_nodes
self.balanced = balanced
self.ranker = ranker
self.maxit = maxit
self.classes_ = (0, 1)

def __del__(self):
if self.obj is not None:
lib.NeuralNet_delete(self.obj)

def fit(self, X, y):
        # it's ugly, but it is better to allocate the C++ net here (in fit) than in __init__
if self.obj is not None:
lib.NeuralNet_delete(self.obj)

X = np.asarray(X, np.float64, 'F')
y = np.asarray(y, np.int32)
Xptr = ctypes.c_void_p(X.ctypes.data)
yptr = ctypes.c_void_p(y.ctypes.data)

if self.ranker:
            self.obj = lib.RankNet_new(self.hidden_nodes, self.balanced)
else:
self.obj = lib.NeuralNet_new(self.hidden_nodes, self.balanced)
lib.NeuralNet_fit(self.obj, X.shape[1], X.shape[0], Xptr, yptr, self.maxit)
return self

def predict(self, X):
X = np.asarray(X, np.float64, 'F')
y = np.zeros(len(X), np.int32)
Xptr = ctypes.c_void_p(X.ctypes.data)
yptr = ctypes.c_void_p(y.ctypes.data)
lib.NeuralNet_predict(self.obj, X.shape[1], X.shape[0], Xptr, yptr)
return y

def predict_proba(self, X):
X = np.asarray(X, np.float64, 'F')
s = np.zeros(len(X), np.float64)
Xptr = ctypes.c_void_p(X.ctypes.data)
sptr = ctypes.c_void_p(s.ctypes.data)
lib.NeuralNet_scores(self.obj, X.shape[1], X.shape[0], Xptr, sptr)
return s


class RankNet(NeuralNet):
def __init__(self, hidden_nodes, maxit=1000):
NeuralNet.__init__(self, hidden_nodes, False, True, maxit)
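A usage sketch for the wrapper above (illustrative only: it assumes the script runs from classification/neuralnet/ so that cpp/compile.sh and cpp/libneuralnet.so resolve, and it uses synthetic data rather than anything from the repository):

import numpy as np
from neuralnet import NeuralNet, RankNet

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = (X[:, 0] + 0.1 * rng.randn(200) > 0).astype(int)

# plain classifier
clf = NeuralNet(hidden_nodes=10, balanced=False, maxit=1000).fit(X, y)
print('train accuracy: %.3f' % np.mean(clf.predict(X) == y))

# pairwise ranker (RankNet); predict_proba returns the raw network scores
rnk = RankNet(hidden_nodes=10, maxit=1000).fit(X, y)
print('scores for the first 5 rows:', rnk.predict_proba(X[:5]))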
File renamed without changes.
File renamed without changes.
File renamed without changes.
58 changes: 58 additions & 0 deletions quantile-classification/qbc.py
@@ -0,0 +1,58 @@
# Songfeng Zheng, "QBoost: Predicting quantiles with boosting for regression
# and binary classification" (2012)

from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from scipy.stats import norm


def K(x, h):
return norm.pdf(x, scale=h)


class ZerosDummyModel:
def __init__(self, tau):
pass

def fit(self, X, y):
return self

def predict(self, X):
return np.zeros(len(X))


class QBC(BaseEstimator, ClassifierMixin):
def __init__(self, tau, M=100, eta=0.1, h=0.1, base_estimator=None):
self.tau = tau
self.M = M
self.eta = eta
self.h = h
self.first_estimator = ZerosDummyModel(tau)
if base_estimator is None:
base_estimator = DecisionTreeRegressor(max_depth=1)
self.base_estimator = base_estimator
self.classes_ = [0, 1]

def fit(self, X, y):
self.fs = [self.first_estimator]
# step 0
f = self.first_estimator.fit(X, y)
# step 1
for m in range(self.M):
f = self.predict_proba(X)
# step 2
U = (y-(1-self.tau))*K(f, self.h)
# step 3
g = clone(self.base_estimator).fit(X, U)
# step 4
self.fs.append(g)
return self

def predict_proba(self, X):
f0 = self.fs[0].predict(X)
r = np.sum([self.eta * f.predict(X) for f in self.fs[1:]], 0)
return f0 + r

def predict(self, X):
return (self.predict_proba(X) >= 0).astype(int)
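A usage sketch for QBC (illustrative, not part of the commit; the breast-cancer dataset is only an example): the idea is that tau trades off false positives against false negatives, so sweeping it on a binary dataset should shift the confusion matrix.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from qbc import QBC

X, y = load_breast_cancer(return_X_y=True)  # binary 0/1 target
Xtr, Xts, ytr, yts = train_test_split(X, y, train_size=0.8)

for tau in (0.1, 0.5, 0.9):
    m = QBC(tau, M=100, eta=0.1).fit(Xtr, ytr)
    (tn, fp), (fn, tp) = confusion_matrix(yts, m.predict(Xts))
    print('tau=%.1f  FP=%d  FN=%d' % (tau, fp, fn))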
42 changes: 42 additions & 0 deletions quantile-classification/test.py
@@ -0,0 +1,42 @@
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from qbc import QBC
from sklearn import datasets
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

print('Testing quantile regression...')

_datasets = [
    ('iris', lambda: datasets.load_iris(True), 1),
    ('digits', lambda: datasets.load_digits(10, True), 5),
]

quantiles = [0.1, 0.9]

n_estimators = 100
eta = 0.1

models = [
('dummy', lambda tau: DummyClassifier('most_frequent')),
('gboost', lambda _: GradientBoostingClassifier(
learning_rate=eta, n_estimators=n_estimators, max_depth=1)),
('qbc', lambda tau: QBC(tau, n_estimators, eta)),
]

for dname, dataset, th in _datasets:
    print('# dataset %s' % dname)
    X, y = dataset()
    y = (y >= th).astype(int)
Xtr, Xts, ytr, yts = train_test_split(X, y, train_size=0.8)
for q, tau in enumerate(quantiles):
print('## quantile %.2f' % tau)
for i, (name, model) in enumerate(models):
m = model(tau).fit(Xtr, ytr)
yp = m.predict(Xts)

            # fraction of all test samples in each confusion-matrix cell
            (tn, fp), (fn, tp) = confusion_matrix(yts, yp) / len(yts)
            print('%10s: FP: %.3f, FN: %.3f' % (name, fp, fn))
26 changes: 26 additions & 0 deletions quantile-regression/qbag.py
@@ -0,0 +1,26 @@
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, RegressorMixin


# approach inspired by:
# http://blog.datadive.net/prediction-intervals-for-random-forests/

class QBag(BaseEstimator, RegressorMixin):
def __init__(self, tau, base_estimator=None):
self.tau = tau
if base_estimator is None:
base_estimator = RandomForestRegressor(100)
self.base_estimator = base_estimator

def fit(self, X, y):
self.base_estimator.fit(X, y)
return self

def predict(self, X):
yp = np.zeros(len(X))
ms = self.base_estimator.estimators_
for i, x in enumerate(X):
yps = [m.predict([x])[0] for m in ms]
yp[i] = np.percentile(yps, self.tau*100)
return yp
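A usage sketch for QBag (illustrative, not from the repository; the diabetes dataset is only an example): fit one model per quantile and read the pair of predictions as a rough prediction interval.

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from qbag import QBag

X, y = load_diabetes(return_X_y=True)
Xtr, Xts, ytr, yts = train_test_split(X, y, train_size=0.8)

lo = QBag(0.1).fit(Xtr, ytr).predict(Xts)  # 10% quantile per test point
hi = QBag(0.9).fit(Xtr, ytr).predict(Xts)  # 90% quantile per test point

coverage = np.mean((yts >= lo) & (yts <= hi))
print('empirical coverage of the [0.1, 0.9] interval: %.2f' % coverage)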
