forked from ujjwalkarn/DataSciencePython
Commit 349d4e6 (1 parent: 5a12077)
Showing 3 changed files with 405 additions and 0 deletions.
@@ -0,0 +1,133 @@
'''
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2004 Sam Hocevar <[email protected]>

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
'''


from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt


# parameters #################################################################

train = 'train.csv'  # path to training file
test = 'test.csv'    # path to testing file

D = 2 ** 20   # number of weights used for learning
alpha = .1    # learning rate for sgd optimization


# function definitions #######################################################

# A. Bounded logloss
# INPUT:
#     p: our prediction
#     y: real answer
# OUTPUT:
#     logarithmic loss of p given y
def logloss(p, y):
    p = max(min(p, 1. - 10e-12), 10e-12)
    return -log(p) if y == 1. else -log(1. - p)
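# For example, logloss(0.9, 1.) == -log(0.9) ~= 0.105, while
# logloss(0.9, 0.) == -log(0.1) ~= 2.303: confident wrong predictions are
# penalized much more heavily than confident correct ones, and the clipping
# above keeps the loss finite even when p is exactly 0. or 1.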


# B. Apply hash trick of the original csv row
# for simplicity, we treat both integer and categorical features as categorical
# INPUT:
#     csv_row: a csv dictionary, ex: {'Label': '1', 'I1': '357', 'I2': '', ...}
#     D: the max index that we can hash to
# OUTPUT:
#     x: a list of indices whose corresponding feature value is 1
def get_x(csv_row, D):
    x = [0]  # 0 is the index of the bias term
    for key, value in csv_row.items():
        index = int(value + key[1:], 16) % D  # weakest hash ever ;)
        x.append(index)
    return x  # x contains indices of features that have a value of 1
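# For example, with D = 2 ** 20, get_x({'I1': '357', 'C1': 'a8e9f'}, D)
# hashes each (value + key suffix) pair as a hex string and returns
# [0, int('3571', 16) % D, int('a8e9f1', 16) % D] == [0, 13681, 584177].
# The feature values must parse as hexadecimal for this toy hash to work.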


# C. Get probability estimation on x
# INPUT:
#     x: features
#     w: weights
# OUTPUT:
#     probability of p(y = 1 | x; w)
def get_p(x, w):
    wTx = 0.
    for i in x:  # do wTx
        wTx += w[i] * 1.  # w[i] * x[i], but since i is in x we know x[i] = 1.
    return 1. / (1. + exp(-max(min(wTx, 20.), -20.)))  # bounded sigmoid
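# Clipping wTx to [-20, 20] bounds the returned probability to roughly
# [2e-9, 1 - 2e-9], so a prediction never saturates to exactly 0 or 1.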


# D. Update given model
# INPUT:
#     w: weights
#     n: a counter that counts the number of times we encounter a feature
#        this is used for adaptive learning rate
#     x: feature
#     p: prediction of our model
#     y: answer
# OUTPUT:
#     w: updated model
#     n: updated count
def update_w(w, n, x, p, y):
    for i in x:
        # alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic
        # (p - y) * x[i] is the current gradient
        # note that in our case, if i in x then x[i] = 1
        w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.)
        n[i] += 1.

    return w, n
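# Why (p - y) works as the gradient: for log loss with p = sigmoid(wTx), the
# derivative of the loss with respect to w[i] is (p - y) * x[i], and every
# feature present in x has x[i] = 1, so each touched weight simply moves by
# -learning_rate * (p - y).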


# training and testing #######################################################

# initialize our model
w = [0.] * D  # weights
n = [0.] * D  # number of times we've encountered a feature

# start training a logistic regression model using one-pass sgd
loss = 0.
for t, row in enumerate(DictReader(open(train))):
    y = 1. if row['Label'] == '1' else 0.

    del row['Label']  # can't let the model peek at the answer
    del row['Id']     # we don't need the Id

    # main training procedure
    # step 1, get the hashed features
    x = get_x(row, D)

    # step 2, get prediction
    p = get_p(x, w)

    # for progress validation, useless for learning our model
    loss += logloss(p, y)
    if t % 1000000 == 0 and t > 1:
        print('%s\tencountered: %d\tcurrent logloss: %f' % (
            datetime.now(), t, loss / t))

    # step 3, update model with answer
    w, n = update_w(w, n, x, p, y)


# testing (build kaggle's submission file)
with open('submission1234.csv', 'w') as submission:
    submission.write('Id,Predicted\n')
    for t, row in enumerate(DictReader(open(test))):
        Id = row['Id']
        del row['Id']
        x = get_x(row, D)
        p = get_p(x, w)
        submission.write('%s,%f\n' % (Id, p))
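A quick way to sanity-check the functions above without the real train.csv is to run them on a couple of hand-made rows. The sketch below is only illustrative and is not part of the committed script; the toy feature values are fabricated, chosen so the simple hex hash in get_x can parse them:

# illustrative sketch, not part of the script above
toy = [({'I1': '3', 'C1': 'a8e9f'}, 1.), ({'I1': '1f', 'C1': 'b04c2'}, 0.)]
w_demo, n_demo = [0.] * D, [0.] * D
for row, y_demo in toy * 50:                      # a few passes over two rows
    x_demo = get_x(row, D)
    p_demo = get_p(x_demo, w_demo)
    w_demo, n_demo = update_w(w_demo, n_demo, x_demo, p_demo, y_demo)
print(get_p(get_x(toy[0][0], D), w_demo))         # drifts above 0.5
print(get_p(get_x(toy[1][0], D), w_demo))         # drifts below 0.5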
@@ -0,0 +1,106 @@
# https://www.kaggle.com/c/amazon-employee-access-challenge/forums/t/4797/starter-code-in-python-with-scikit-learn-auc-885

""" Amazon Access Challenge Starter Code

These files provide some starter code using the scikit-learn library.
It provides some examples of how to design a simple algorithm, including
pre-processing, training a logistic regression classifier on the data,
assessing its performance through cross-validation, and some pointers on
where to go next.

Paul Duan <[email protected]>
"""

from __future__ import division

import numpy as np
from sklearn import (metrics, cross_validation, linear_model, preprocessing)

SEED = 42  # always use a seed for randomized procedures


def load_data(filename, use_labels=True):
    """
    Load data from CSV files and return them as numpy arrays.
    The use_labels parameter indicates whether one should
    read the first column (containing class labels). If false,
    return all 0s.
    """

    # load columns 1 to 8 (ignore the last one)
    data = np.loadtxt(open("data/" + filename), delimiter=',',
                      usecols=range(1, 9), skiprows=1)
    if use_labels:
        labels = np.loadtxt(open("data/" + filename), delimiter=',',
                            usecols=[0], skiprows=1)
    else:
        labels = np.zeros(data.shape[0])
    return labels, data


def save_results(predictions, filename):
    """Given a vector of predictions, save results in CSV format."""
    with open(filename, 'w') as f:
        f.write("id,ACTION\n")
        for i, pred in enumerate(predictions):
            f.write("%d,%f\n" % (i + 1, pred))


def main():
    """
    Fit models and make predictions.
    We'll use one-hot encoding to transform our categorical features
    into binary features.

    y and X will be numpy array objects.
    """
    model = linear_model.LogisticRegression(C=3)  # the classifier we'll use

    # === load data in memory === #
    print "loading data"
    y, X = load_data('train.csv')
    y_test, X_test = load_data('test.csv', use_labels=False)

    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)
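    # e.g. a categorical column whose distinct values are {3, 7, 9} becomes
    # three 0/1 indicator columns; fitting on train and test together means
    # no category encountered at transform time is unknown to the encoder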

    # if you want to create new features, you'll need to compute them
    # before the encoding, and append them to your dataset after

    # === training & metrics === #
    mean_auc = 0.0
    n = 10  # repeat the CV procedure 10 times to get more precise results
    for i in range(n):
        # for each iteration, randomly hold out 20% of the data as CV set
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc

    print "Mean AUC: %f" % (mean_auc/n)

    # === Predictions === #
    # When making predictions, retrain the model on the whole training set
    model.fit(X, y)
    preds = model.predict_proba(X_test)[:, 1]
    filename = raw_input("Enter name for submission file: ")
    save_results(preds, filename + ".csv")


if __name__ == '__main__':
    main()
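Note that this file targets an older stack: the print statements and raw_input are Python 2, and the sklearn.cross_validation module was later deprecated and removed in favor of sklearn.model_selection. Below is a rough sketch of one cross-validation fold against the newer API, with everything else in the script assumed unchanged:

# sketch only, assuming scikit-learn >= 0.20 and Python 3
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.20,
                                                random_state=SEED)
model.fit(X_train, y_train)
print("AUC: %f" % roc_auc_score(y_cv, model.predict_proba(X_cv)[:, 1]))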