
Commit

Moved around some of my models.
rpmcruz committed Feb 12, 2017
1 parent d942b9e commit 6ccf625
Showing 20 changed files with 662 additions and 59 deletions.
22 changes: 22 additions & 0 deletions classification/bagging/randomforest.py
@@ -0,0 +1,22 @@
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np


class MyRandomForestClassifier:
    def __init__(self, n_estimators):
        self.n_estimators = n_estimators

    def fit(self, X, y):
        X, y = np.asarray(X), np.asarray(y)
        self.trees = []
        for _ in range(self.n_estimators):
            # bagging: each tree is fit on a bootstrap sample (drawn with
            # replacement); random feature subsets come from max_features
            ix = np.random.choice(len(X), len(X), replace=True)
            self.trees.append(
                DecisionTreeClassifier(max_features='sqrt').fit(X[ix], y[ix]))
        return self

    def predict(self, X):
        # majority vote: average the trees' 0/1 predictions, threshold at 0.5
        yp = [tree.predict(X) for tree in self.trees]
        return ((np.sum(yp, 0) / len(self.trees)) > 0.5).astype(int)

    def score(self, X, y):
        return accuracy_score(y, self.predict(X))
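For reference, a quick usage sketch of the class above (not part of the commit; the breast-cancer dataset, the import path and the comparison against sklearn's RandomForestClassifier are only illustrative, and assume a binary 0/1 target so the majority-vote threshold applies):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from randomforest import MyRandomForestClassifier  # assumes running next to randomforest.py

X, y = load_breast_cancer(return_X_y=True)  # binary 0/1 target
Xtr, Xts, ytr, yts = train_test_split(X, y, train_size=0.8)

mine = MyRandomForestClassifier(25).fit(Xtr, ytr)
ref = RandomForestClassifier(n_estimators=25).fit(Xtr, ytr)
print('mine: %.3f  sklearn: %.3f' % (mine.score(Xts, yts), ref.score(Xts, yts)))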
File renamed without changes.
File renamed without changes.
classification/neuralnet/cpp/neuralnet.cpp
@@ -168,61 +168,6 @@ struct NeuralNet
}
};

// RankNet as in Burges et al (2005)
// Uses a neural network for ranking.

struct RankNet : NeuralNet
{
RankNet(int hidden_nodes) : NeuralNet(hidden_nodes, false) {
}

virtual void fit(mat X, const ivec& y, int maxit) {
build(X.n_cols, hidden_nodes);
X = fitnorm(X);

/* int n1 = 0;
for(unsigned int i = 0; i < y.n_elem; i++)
if(y[i] == 1)
n1++;
unsigned int _maxit = maxit/n1;
cout << "maxit: " << _maxit << endl;*/

for(int t = 0; t < maxit; t++) {
double errors = 0;
for(unsigned int i = 0; i < X.n_rows; i++)
for(unsigned int j = 0; j < X.n_rows; j++) {
if(i == j)
continue;
#if 1 // ignore same -- they only affect the threshold, not the learning
if(y[i] == y[j])
continue;
#endif
double P = (y[i] - y[j] + 1)/2.;

mat* l1 = fprop(X.row(i));
mat* l2 = fprop(X.row(j));

double s = l1[2][0] - l2[2][0];
double C = exp(s)/(exp(s)+1) - P;

backprop(C, -1, l1[0], l1[1], l1[2]);
backprop(C, +1, l2[0], l2[1], l2[2]);
errors += abs(C);
delete [] l1;
delete [] l2;
}
if(t % 100 == 0)
cout << t << " - " << errors << endl;
if(errors/(X.n_rows*(X.n_rows-1)) < 0.01)
break;
}

vec H(X.n_rows);
scores(X, false, H);
th = choose_threshold(H, y);
}
};

//** wrapper

extern "C" {
@@ -247,8 +192,4 @@ extern "C" {
vec _S((double*) S, N, false, true);
nn->scores(_X, true, _S);
}

NeuralNet* RankNet_new(int hidden_nodes, bool balanced) {
return new RankNet(hidden_nodes);
}
}
64 changes: 64 additions & 0 deletions classification/neuralnet/cpp/ranknet.cpp
@@ -0,0 +1,64 @@
#include "neuralnet.cpp"

// RankNet as in Burges et al (2005)
// Uses a neural network for ranking.

struct RankNet : NeuralNet
{
RankNet(int hidden_nodes) : NeuralNet(hidden_nodes, false) {
}

virtual void fit(mat X, const ivec& y, int maxit) {
build(X.n_cols, hidden_nodes);
X = fitnorm(X);

/* int n1 = 0;
for(unsigned int i = 0; i < y.n_elem; i++)
if(y[i] == 1)
n1++;
unsigned int _maxit = maxit/n1;
cout << "maxit: " << _maxit << endl;*/

for(int t = 0; t < maxit; t++) {
double errors = 0;
for(unsigned int i = 0; i < X.n_rows; i++)
for(unsigned int j = 0; j < X.n_rows; j++) {
if(i == j)
continue;
#if 1 // ignore same -- they only affect the threshold, not the learning
if(y[i] == y[j])
continue;
#endif
double P = (y[i] - y[j] + 1)/2.;

mat* l1 = fprop(X.row(i));
mat* l2 = fprop(X.row(j));

double s = l1[2][0] - l2[2][0];
				// derivative of the pairwise cross-entropy cost w.r.t. the score
				// difference s: sigma(s) - P, with sigma the logistic function
				double C = exp(s)/(exp(s)+1) - P;

backprop(C, -1, l1[0], l1[1], l1[2]);
backprop(C, +1, l2[0], l2[1], l2[2]);
errors += abs(C);
delete [] l1;
delete [] l2;
}
if(t % 100 == 0)
cout << t << " - " << errors << endl;
if(errors/(X.n_rows*(X.n_rows-1)) < 0.01)
break;
}

vec H(X.n_rows);
scores(X, false, H);
th = choose_threshold(H, y);
}
};

//** wrapper

extern "C" {
NeuralNet* RankNet_new(int hidden_nodes, bool balanced) {
return new RankNet(hidden_nodes);
}
}
65 changes: 65 additions & 0 deletions classification/neuralnet/neuralnet.py
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-

# Wrapper around cpp/neuralnet.cpp

import os
os.system('cpp/compile.sh')
print('compiled neuralnet.cpp -> libneuralnet.so')

import ctypes
lib = ctypes.cdll.LoadLibrary('cpp/libneuralnet.so')

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin


class NeuralNet(BaseEstimator, ClassifierMixin):
def __init__(self, hidden_nodes, balanced, ranker=False, maxit=1000):
self.obj = None
self.hidden_nodes = hidden_nodes
self.balanced = balanced
self.ranker = ranker
self.maxit = maxit
self.classes_ = (0, 1)

def __del__(self):
if self.obj is not None:
lib.NeuralNet_delete(self.obj)

def fit(self, X, y):
        # it's ugly, but it is better to allocate the C++ net here (in fit) than in __init__
if self.obj is not None:
lib.NeuralNet_delete(self.obj)

X = np.asarray(X, np.float64, 'F')
y = np.asarray(y, np.int32)
Xptr = ctypes.c_void_p(X.ctypes.data)
yptr = ctypes.c_void_p(y.ctypes.data)

if self.ranker:
            self.obj = lib.RankNet_new(self.hidden_nodes, self.balanced)
else:
self.obj = lib.NeuralNet_new(self.hidden_nodes, self.balanced)
lib.NeuralNet_fit(self.obj, X.shape[1], X.shape[0], Xptr, yptr, self.maxit)
return self

def predict(self, X):
X = np.asarray(X, np.float64, 'F')
y = np.zeros(len(X), np.int32)
Xptr = ctypes.c_void_p(X.ctypes.data)
yptr = ctypes.c_void_p(y.ctypes.data)
lib.NeuralNet_predict(self.obj, X.shape[1], X.shape[0], Xptr, yptr)
return y

def predict_proba(self, X):
X = np.asarray(X, np.float64, 'F')
s = np.zeros(len(X), np.float64)
Xptr = ctypes.c_void_p(X.ctypes.data)
sptr = ctypes.c_void_p(s.ctypes.data)
lib.NeuralNet_scores(self.obj, X.shape[1], X.shape[0], Xptr, sptr)
return s


class RankNet(NeuralNet):
def __init__(self, hidden_nodes, maxit=1000):
NeuralNet.__init__(self, hidden_nodes, False, True, maxit)
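A usage sketch for the wrapper above (illustrative only: it assumes the script runs from classification/neuralnet/ so that cpp/compile.sh and cpp/libneuralnet.so resolve, and it uses synthetic data rather than anything from the repository):

import numpy as np
from neuralnet import NeuralNet, RankNet

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = (X[:, 0] + 0.1 * rng.randn(200) > 0).astype(int)

# plain classifier
clf = NeuralNet(hidden_nodes=10, balanced=False, maxit=1000).fit(X, y)
print('train accuracy: %.3f' % np.mean(clf.predict(X) == y))

# pairwise ranker (RankNet); predict_proba returns the raw network scores
rnk = RankNet(hidden_nodes=10, maxit=1000).fit(X, y)
print('scores for the first 5 rows:', rnk.predict_proba(X[:5]))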
File renamed without changes.
File renamed without changes.
File renamed without changes.
58 changes: 58 additions & 0 deletions quantile-classification/qbc.py
@@ -0,0 +1,58 @@
# Songfeng Zheng, "QBoost: Predicting quantiles with boosting for regression
# and binary classification" (2012)

from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from scipy.stats import norm


def K(x, h):
return norm.pdf(x, scale=h)


class ZerosDummyModel:
def __init__(self, tau):
pass

def fit(self, X, y):
return self

def predict(self, X):
return np.zeros(len(X))


class QBC(BaseEstimator, ClassifierMixin):
def __init__(self, tau, M=100, eta=0.1, h=0.1, base_estimator=None):
self.tau = tau
self.M = M
self.eta = eta
self.h = h
self.first_estimator = ZerosDummyModel(tau)
if base_estimator is None:
base_estimator = DecisionTreeRegressor(max_depth=1)
self.base_estimator = base_estimator
self.classes_ = [0, 1]

def fit(self, X, y):
self.fs = [self.first_estimator]
# step 0
f = self.first_estimator.fit(X, y)
# step 1
for m in range(self.M):
f = self.predict_proba(X)
# step 2
U = (y-(1-self.tau))*K(f, self.h)
# step 3
g = clone(self.base_estimator).fit(X, U)
# step 4
self.fs.append(g)
return self

def predict_proba(self, X):
f0 = self.fs[0].predict(X)
r = np.sum([self.eta * f.predict(X) for f in self.fs[1:]], 0)
return f0 + r

def predict(self, X):
return (self.predict_proba(X) >= 0).astype(int)
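A usage sketch for QBC (illustrative, not part of the commit; the breast-cancer dataset is only an example): the idea is that tau trades off false positives against false negatives, so sweeping it on a binary dataset should shift the confusion matrix.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from qbc import QBC

X, y = load_breast_cancer(return_X_y=True)  # binary 0/1 target
Xtr, Xts, ytr, yts = train_test_split(X, y, train_size=0.8)

for tau in (0.1, 0.5, 0.9):
    m = QBC(tau, M=100, eta=0.1).fit(Xtr, ytr)
    (tn, fp), (fn, tp) = confusion_matrix(yts, m.predict(Xts))
    print('tau=%.1f  FP=%d  FN=%d' % (tau, fp, fn))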
42 changes: 42 additions & 0 deletions quantile-classification/test.py
@@ -0,0 +1,42 @@
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from qbc import QBC
from sklearn import datasets
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

print('Testing quantile regression...')

_datasets = [
    ('iris', lambda: datasets.load_iris(True), 1),
    ('digits', lambda: datasets.load_digits(10, True), 5),
]

quantiles = [0.1, 0.9]

n_estimators = 100
eta = 0.1

models = [
('dummy', lambda tau: DummyClassifier('most_frequent')),
('gboost', lambda _: GradientBoostingClassifier(
learning_rate=eta, n_estimators=n_estimators, max_depth=1)),
('qbc', lambda tau: QBC(tau, n_estimators, eta)),
]

for dname, dataset, th in _datasets:
    print('# dataset %s' % dname)
    X, y = dataset()
    y = (y >= th).astype(int)
Xtr, Xts, ytr, yts = train_test_split(X, y, train_size=0.8)
for q, tau in enumerate(quantiles):
print('## quantile %.2f' % tau)
for i, (name, model) in enumerate(models):
m = model(tau).fit(Xtr, ytr)
yp = m.predict(Xts)

            # fraction of all test samples in each confusion-matrix cell
            (tn, fp), (fn, tp) = confusion_matrix(yts, yp) / len(yts)
            print('%10s: FP: %.3f, FN: %.3f' % (name, fp, fn))
26 changes: 26 additions & 0 deletions quantile-regression/qbag.py
@@ -0,0 +1,26 @@
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, RegressorMixin


# approach inspired by:
# http://blog.datadive.net/prediction-intervals-for-random-forests/

class QBag(BaseEstimator, RegressorMixin):
def __init__(self, tau, base_estimator=None):
self.tau = tau
if base_estimator is None:
base_estimator = RandomForestRegressor(100)
self.base_estimator = base_estimator

def fit(self, X, y):
self.base_estimator.fit(X, y)
return self

def predict(self, X):
yp = np.zeros(len(X))
ms = self.base_estimator.estimators_
for i, x in enumerate(X):
yps = [m.predict([x])[0] for m in ms]
yp[i] = np.percentile(yps, self.tau*100)
return yp
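A usage sketch for QBag (illustrative, not from the repository; the diabetes dataset is only an example): fit one model per quantile and read the pair of predictions as a rough prediction interval.

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from qbag import QBag

X, y = load_diabetes(return_X_y=True)
Xtr, Xts, ytr, yts = train_test_split(X, y, train_size=0.8)

lo = QBag(0.1).fit(Xtr, ytr).predict(Xts)  # 10% quantile per test point
hi = QBag(0.9).fit(Xtr, ytr).predict(Xts)  # 90% quantile per test point

coverage = np.mean((yts >= lo) & (yts <= hi))
print('empirical coverage of the [0.1, 0.9] interval: %.2f' % coverage)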
