Commit

Add files via upload
ujjwalkarn committed May 15, 2016
1 parent 5a12077 commit 349d4e6
Showing 3 changed files with 405 additions and 0 deletions.
133 changes: 133 additions & 0 deletions Logistic-Regression/citreo.py
@@ -0,0 +1,133 @@
'''
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.
'''


from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt


# parameters #################################################################

train = 'train.csv' # path to training file
test = 'test.csv' # path to testing file

D = 2 ** 20 # number of weights used for learning
alpha = .1 # learning rate for sgd optimization
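# note on D: every raw feature string is hashed into one of D buckets below,
# so memory stays fixed at D weights no matter how many distinct raw values
# the data contains; a larger D means fewer hash collisions but more memory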


# function definitions #######################################################

# A. Bounded logloss
# INPUT:
# p: our prediction
# y: real answer
# OUTPUT
# logarithmic loss of p given y
def logloss(p, y):
p = max(min(p, 1. - 10e-12), 10e-12)
return -log(p) if y == 1. else -log(1. - p)
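# e.g. logloss(0.9, 1.) = -log(0.9) ~= 0.105, while a confidently wrong
# logloss(1.0, 0.) is clamped to -log(10e-12) ~= 25.3 instead of infinity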


# B. Apply hash trick of the original csv row
# for simplicity, we treat both integer and categorical features as categorical
# INPUT:
# csv_row: a csv dictionary, ex: {'Label': '1', 'I1': '357', 'I2': '', ...}
# D: the number of hash buckets, i.e. one more than the max index we can hash to
# OUTPUT:
# x: a list of the feature indices whose value is 1
def get_x(csv_row, D):
x = [0] # 0 is the index of the bias term
for key, value in csv_row.items():
index = int(value + key[1:], 16) % D # weakest hash ever ;)
x.append(index)
return x # x contains indices of features that have a value of 1
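# worked example: for the column 'I1' with value '357', the hashed index is
# int('357' + '1', 16) % D = int('3571', 16) % 2**20 = 13681; the appended
# key[1:] acts as a crude per-column salt (hence "weakest hash ever")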


# C. Get probability estimation on x
# INPUT:
# x: features
# w: weights
# OUTPUT:
# the probability p(y = 1 | x; w)
def get_p(x, w):
wTx = 0.
for i in x: # do wTx
        wTx += w[i] * 1.  # w[i] * x[i], but if i is in x then x[i] = 1.
return 1. / (1. + exp(-max(min(wTx, 20.), -20.))) # bounded sigmoid
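# clamping wTx to [-20, 20] keeps exp() well-behaved and bounds the output
# to roughly [2e-9, 1 - 2e-9] (1 / (1 + exp(20)) ~= 2.1e-9), so the
# prediction never reaches exactly 0 or 1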


# D. Update given model
# INPUT:
# w: weights
# n: a counter that counts the number of times we encounter a feature
# this is used for adaptive learning rate
# x: features (list of hashed indices)
# p: prediction of our model
# y: answer
# OUTPUT:
# w: updated model
# n: updated count
def update_w(w, n, x, p, y):
for i in x:
# alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic
# (p - y) * x[i] is the current gradient
# note that in our case, if i in x then x[i] = 1
w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.)
n[i] += 1.

return w, n
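# derivation sketch: for logistic loss with p = sigmoid(wTx), the gradient
# w.r.t. w[i] is (p - y) * x[i]; since x[i] = 1 for every hashed index in x,
# each step is w[i] -= rate * (p - y), with the per-feature rate
# alpha / (sqrt(n[i]) + 1) shrinking as the feature is seen more often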


# training and testing #######################################################

# initialize our model
w = [0.] * D # weights
n = [0.] * D # number of times we've encountered a feature

# start training a logistic regression model using one-pass sgd
loss = 0.
for t, row in enumerate(DictReader(open(train))):
y = 1. if row['Label'] == '1' else 0.

    del row['Label']  # can't let the model peek at the answer
del row['Id'] # we don't need the Id

# main training procedure
# step 1, get the hashed features
x = get_x(row, D)

# step 2, get prediction
p = get_p(x, w)

# for progress validation, useless for learning our model
loss += logloss(p, y)
if t % 1000000 == 0 and t > 1:
print('%s\tencountered: %d\tcurrent logloss: %f' % (
datetime.now(), t, loss/t))

# step 3, update model with answer
w, n = update_w(w, n, x, p, y)

# testing (build kaggle's submission file)
with open('submission1234.csv', 'w') as submission:
submission.write('Id,Predicted\n')
for t, row in enumerate(DictReader(open(test))):
Id = row['Id']
del row['Id']
x = get_x(row, D)
p = get_p(x, w)
submission.write('%s,%f\n' % (Id, p))
106 changes: 106 additions & 0 deletions Logistic-Regression/classifier_corrected.py
@@ -0,0 +1,106 @@
#https://www.kaggle.com/c/amazon-employee-access-challenge/forums/t/4797/starter-code-in-python-with-scikit-learn-auc-885

""" Amazon Access Challenge Starter Code
These files provide some starter code using
the scikit-learn library. It provides some examples on how
to design a simple algorithm, including pre-processing,
training a logistic regression classifier on the data,
assessing its performance through cross-validation, and some
pointers on where to go next.
Paul Duan <[email protected]>
"""

from __future__ import division

import numpy as np
from sklearn import (metrics, cross_validation, linear_model, preprocessing)

SEED = 42 # always use a seed for randomized procedures


def load_data(filename, use_labels=True):
"""
Load data from CSV files and return them as numpy arrays
The use_labels parameter indicates whether one should
read the first column (containing class labels). If false,
return all 0s.
"""

    # load columns 1 to 8 (ignore the last one)
data = np.loadtxt(open("data/" + filename), delimiter=',',
usecols=range(1, 9), skiprows=1)
if use_labels:
labels = np.loadtxt(open("data/" + filename), delimiter=',',
usecols=[0], skiprows=1)
else:
labels = np.zeros(data.shape[0])
return labels, data


def save_results(predictions, filename):
"""Given a vector of predictions, save results in CSV format."""
with open(filename, 'w') as f:
f.write("id,ACTION\n")
for i, pred in enumerate(predictions):
f.write("%d,%f\n" % (i + 1, pred))


def main():
"""
Fit models and make predictions.
We'll use one-hot encoding to transform our categorical features
into binary features.
y and X will be numpy array objects.
"""
model = linear_model.LogisticRegression(C=3) # the classifier we'll use

# === load data in memory === #
print "loading data"
y, X = load_data('train.csv')
y_test, X_test = load_data('test.csv', use_labels=False)

# === one-hot encoding === #
# we want to encode the category IDs encountered both in
# the training and the test set, so we fit the encoder on both
encoder = preprocessing.OneHotEncoder()
encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # returns a sparse matrix (see scipy.sparse)
X_test = encoder.transform(X_test)
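    # e.g. a categorical column holding the IDs [3, 7, 3] becomes two binary
    # columns ("is it 3?", "is it 7?"); the Amazon IDs expand into thousands
    # of such columns, which is why a sparse matrix is used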

# if you want to create new features, you'll need to compute them
# before the encoding, and append them to your dataset after

# === training & metrics === #
mean_auc = 0.0
n = 10 # repeat the CV procedure 10 times to get more precise results
for i in range(n):
# for each iteration, randomly hold out 20% of the data as CV set
X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
X, y, test_size=.20, random_state=i*SEED)

# if you want to perform feature selection / hyperparameter
# optimization, this is where you want to do it

# train model and make predictions
model.fit(X_train, y_train)
preds = model.predict_proba(X_cv)[:, 1]

# compute AUC metric for this CV fold
fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
roc_auc = metrics.auc(fpr, tpr)
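        # equivalently, metrics.roc_auc_score(y_cv, preds) computes the same
        # number in one call on recent scikit-learn versions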
print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
mean_auc += roc_auc

print "Mean AUC: %f" % (mean_auc/n)

# === Predictions === #
# When making predictions, retrain the model on the whole training set
model.fit(X, y)
preds = model.predict_proba(X_test)[:, 1]
filename = raw_input("Enter name for submission file: ")
save_results(preds, filename + ".csv")

if __name__ == '__main__':
main()