forked from ujjwalkarn/DataSciencePython
Commit 349d4e6 (1 parent: 5a12077)
Showing 3 changed files with 405 additions and 0 deletions.
@@ -0,0 +1,133 @@
'''
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2004 Sam Hocevar <[email protected]>

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
'''


from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt


# parameters #################################################################

train = 'train.csv'  # path to training file
test = 'test.csv'    # path to testing file

D = 2 ** 20   # number of weights used for learning
alpha = .1    # learning rate for sgd optimization


# function definitions #######################################################

# A. Bounded logloss
# INPUT:
#     p: our prediction
#     y: real answer
# OUTPUT:
#     logarithmic loss of p given y
def logloss(p, y):
    p = max(min(p, 1. - 10e-12), 10e-12)
    return -log(p) if y == 1. else -log(1. - p)
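# For example, logloss(0.9, 1.) == -log(0.9) ~= 0.105, while
# logloss(0.9, 0.) == -log(0.1) ~= 2.303: confident wrong predictions are
# penalized much more heavily than confident correct ones, and the clipping
# above keeps the loss finite even when p is exactly 0. or 1.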


# B. Apply hash trick of the original csv row
# for simplicity, we treat both integer and categorical features as categorical
# INPUT:
#     csv_row: a csv dictionary, ex: {'Label': '1', 'I1': '357', 'I2': '', ...}
#     D: the max index that we can hash to
# OUTPUT:
#     x: a list of indices whose corresponding feature value is 1
def get_x(csv_row, D):
    x = [0]  # 0 is the index of the bias term
    for key, value in csv_row.items():
        index = int(value + key[1:], 16) % D  # weakest hash ever ;)
        x.append(index)
    return x  # x contains indices of features that have a value of 1
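# For example, with D = 2 ** 20, get_x({'I1': '357', 'C1': 'a8e9f'}, D)
# hashes each (value + key suffix) pair as a hex string and returns
# [0, int('3571', 16) % D, int('a8e9f1', 16) % D] == [0, 13681, 584177].
# The feature values must parse as hexadecimal for this toy hash to work.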


# C. Get probability estimation on x
# INPUT:
#     x: features
#     w: weights
# OUTPUT:
#     probability of p(y = 1 | x; w)
def get_p(x, w):
    wTx = 0.
    for i in x:  # do wTx
        wTx += w[i] * 1.  # w[i] * x[i], but since i is in x we know x[i] = 1.
    return 1. / (1. + exp(-max(min(wTx, 20.), -20.)))  # bounded sigmoid
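# Clipping wTx to [-20, 20] bounds the returned probability to roughly
# [2e-9, 1 - 2e-9], so a prediction never saturates to exactly 0 or 1.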


# D. Update given model
# INPUT:
#     w: weights
#     n: a counter that counts the number of times we encounter a feature
#        this is used for adaptive learning rate
#     x: feature
#     p: prediction of our model
#     y: answer
# OUTPUT:
#     w: updated model
#     n: updated count
def update_w(w, n, x, p, y):
    for i in x:
        # alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic
        # (p - y) * x[i] is the current gradient
        # note that in our case, if i in x then x[i] = 1
        w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.)
        n[i] += 1.

    return w, n
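# Why (p - y) works as the gradient: for log loss with p = sigmoid(wTx), the
# derivative of the loss with respect to w[i] is (p - y) * x[i], and every
# feature present in x has x[i] = 1, so each touched weight simply moves by
# -learning_rate * (p - y).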


# training and testing #######################################################

# initialize our model
w = [0.] * D  # weights
n = [0.] * D  # number of times we've encountered a feature

# start training a logistic regression model using one-pass sgd
loss = 0.
for t, row in enumerate(DictReader(open(train))):
    y = 1. if row['Label'] == '1' else 0.

    del row['Label']  # can't let the model peek at the answer
    del row['Id']     # we don't need the Id

    # main training procedure
    # step 1, get the hashed features
    x = get_x(row, D)

    # step 2, get prediction
    p = get_p(x, w)

    # for progress validation, useless for learning our model
    loss += logloss(p, y)
    if t % 1000000 == 0 and t > 1:
        print('%s\tencountered: %d\tcurrent logloss: %f' % (
            datetime.now(), t, loss / t))

    # step 3, update model with answer
    w, n = update_w(w, n, x, p, y)


# testing (build kaggle's submission file)
with open('submission1234.csv', 'w') as submission:
    submission.write('Id,Predicted\n')
    for t, row in enumerate(DictReader(open(test))):
        Id = row['Id']
        del row['Id']
        x = get_x(row, D)
        p = get_p(x, w)
        submission.write('%s,%f\n' % (Id, p))
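A quick way to sanity-check the functions above without the real train.csv is to run them on a couple of hand-made rows. The sketch below is only illustrative and is not part of the committed script; the toy feature values are fabricated, chosen so the simple hex hash in get_x can parse them:

# illustrative sketch, not part of the script above
toy = [({'I1': '3', 'C1': 'a8e9f'}, 1.), ({'I1': '1f', 'C1': 'b04c2'}, 0.)]
w_demo, n_demo = [0.] * D, [0.] * D
for row, y_demo in toy * 50:                      # a few passes over two rows
    x_demo = get_x(row, D)
    p_demo = get_p(x_demo, w_demo)
    w_demo, n_demo = update_w(w_demo, n_demo, x_demo, p_demo, y_demo)
print(get_p(get_x(toy[0][0], D), w_demo))         # drifts above 0.5
print(get_p(get_x(toy[1][0], D), w_demo))         # drifts below 0.5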
@@ -0,0 +1,106 @@
# https://www.kaggle.com/c/amazon-employee-access-challenge/forums/t/4797/starter-code-in-python-with-scikit-learn-auc-885

""" Amazon Access Challenge Starter Code

These files provide some starter code using the scikit-learn library.
It provides some examples of how to design a simple algorithm, including
pre-processing, training a logistic regression classifier on the data,
assessing its performance through cross-validation, and some pointers on
where to go next.

Paul Duan <[email protected]>
"""

from __future__ import division

import numpy as np
from sklearn import (metrics, cross_validation, linear_model, preprocessing)

SEED = 42  # always use a seed for randomized procedures


def load_data(filename, use_labels=True):
    """
    Load data from CSV files and return them as numpy arrays.
    The use_labels parameter indicates whether one should
    read the first column (containing class labels). If false,
    return all 0s.
    """

    # load columns 1 to 8 (ignore the last one)
    data = np.loadtxt(open("data/" + filename), delimiter=',',
                      usecols=range(1, 9), skiprows=1)
    if use_labels:
        labels = np.loadtxt(open("data/" + filename), delimiter=',',
                            usecols=[0], skiprows=1)
    else:
        labels = np.zeros(data.shape[0])
    return labels, data


def save_results(predictions, filename):
    """Given a vector of predictions, save results in CSV format."""
    with open(filename, 'w') as f:
        f.write("id,ACTION\n")
        for i, pred in enumerate(predictions):
            f.write("%d,%f\n" % (i + 1, pred))


def main():
    """
    Fit models and make predictions.
    We'll use one-hot encoding to transform our categorical features
    into binary features.

    y and X will be numpy array objects.
    """
    model = linear_model.LogisticRegression(C=3)  # the classifier we'll use

    # === load data in memory === #
    print "loading data"
    y, X = load_data('train.csv')
    y_test, X_test = load_data('test.csv', use_labels=False)

    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)
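    # e.g. a categorical column whose distinct values are {3, 7, 9} becomes
    # three 0/1 indicator columns; fitting on train and test together means
    # no category encountered at transform time is unknown to the encoder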

    # if you want to create new features, you'll need to compute them
    # before the encoding, and append them to your dataset after

    # === training & metrics === #
    mean_auc = 0.0
    n = 10  # repeat the CV procedure 10 times to get more precise results
    for i in range(n):
        # for each iteration, randomly hold out 20% of the data as CV set
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc

    print "Mean AUC: %f" % (mean_auc/n)

    # === Predictions === #
    # When making predictions, retrain the model on the whole training set
    model.fit(X, y)
    preds = model.predict_proba(X_test)[:, 1]
    filename = raw_input("Enter name for submission file: ")
    save_results(preds, filename + ".csv")


if __name__ == '__main__':
    main()
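Note that this file targets an older stack: the print statements and raw_input are Python 2, and the sklearn.cross_validation module was later deprecated and removed in favor of sklearn.model_selection. Below is a rough sketch of one cross-validation fold against the newer API, with everything else in the script assumed unchanged:

# sketch only, assuming scikit-learn >= 0.20 and Python 3
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.20,
                                                random_state=SEED)
model.fit(X_train, y_train)
print("AUC: %f" % roc_auc_score(y_cv, model.predict_proba(X_cv)[:, 1]))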