Skip to content

Commit

Permalink
finish gp.py
Browse files Browse the repository at this point in the history
  • Loading branch information
maplezzz committed Sep 16, 2018
1 parent 8232936 commit 5ad12dc
Show file tree
Hide file tree
Showing 10 changed files with 16,842 additions and 48 deletions.
9 changes: 9 additions & 0 deletions HW0/.idea/HW0.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 41 additions & 0 deletions HW0/.idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions HW0/.idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

284 changes: 284 additions & 0 deletions HW0/.idea/workspace.xml

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion HW1/gd.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def GD(X, Y, w, eta, iteration, lambdaL2):
text = open('data/train.csv', 'r', encoding='big5')
row = csv.reader(text, delimiter=',')
for r in row:
# 第0行的数据无用 处理后data[0]表示WS_HR
if n_row != 0:
for i in range(3,27):
if r[i] != "NR":
Expand Down
46 changes: 0 additions & 46 deletions HW2/dataProcess.py

This file was deleted.

170 changes: 170 additions & 0 deletions HW2/gp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
from math import floor, log
import os
import argparse



# Directory that receives the prediction CSV written by the main script.
output_dir = "output/"

def dataProcess_X(rawData):
    """Turn the raw table into a standardized, fully numeric feature frame.

    Args:
        rawData: pandas DataFrame holding a "sex" column, any number of
            numeric and text columns and, optionally, the "income" label.

    Returns:
        DataFrame whose first column is "sex" (0 for every value other than
        " Female", 1 for " Female"), followed by the numeric columns and the
        one-hot encoded text columns. Every column is standardized to zero
        mean and unit (sample) standard deviation.

    Raises:
        KeyError: if rawData has no "sex" column.
    """
    # "sex" only has two values, so it is encoded by hand as a single 0/1
    # column instead of being one-hot encoded. "income" is the label and is
    # removed whenever it is present.
    label_columns = ["income"] if "income" in rawData.columns else []
    Data = rawData.drop(["sex"] + label_columns, axis=1)

    # Text columns (object or dedicated string dtype) are one-hot encoded;
    # everything else is treated as already numeric.
    listObjectColumn = [
        col for col in Data.columns
        if pd.api.types.is_string_dtype(Data[col].dtype)
    ]
    listNonObjectColumn = [col for col in Data.columns if col not in listObjectColumn]

    ObjectData = Data[listObjectColumn]
    # Work on a copy so insert() never writes into a slice of `Data`.
    NonObjectData = Data[listNonObjectColumn].copy()
    # "int64" replaces the np.int alias, which no longer exists in NumPy >= 1.24.
    NonObjectData.insert(0, "sex", (rawData["sex"] == " Female").astype("int64"))
    # Every distinct value of a text column becomes its own indicator column.
    ObjectData = pd.get_dummies(ObjectData)

    Data_x = pd.concat([NonObjectData, ObjectData], axis=1).astype("int64")

    # Standardize. A constant column has zero deviation; dividing by 1 there
    # yields 0 for that column instead of filling it with NaN.
    std = Data_x.std()
    std = std.where(std != 0, 1.0)
    Data_x = (Data_x - Data_x.mean()) / std

    return Data_x

def dataProcess_Y(rawData):
    """Extract the label column as a one-column int64 DataFrame.

    A row is labelled 1 when its "income" value equals " >50K" and 0
    otherwise; the resulting column keeps the name "income".
    """
    is_high_income = rawData['income'] == ' >50K'
    return pd.DataFrame(is_high_income.astype("int64"), columns=["income"])


def sigmoid(z):
    """Logistic function of z, clipped to the range [1e-8, 1 - 1e-8].

    Args:
        z: scalar or array-like of logits.

    Returns:
        1 / (1 + exp(-z)) element-wise, clipped so the result is never
        exactly 0 or 1.
    """
    # For |z| > 50 the logistic value already lies far outside the clip
    # bounds below, so clamping z first leaves the result unchanged while
    # keeping np.exp from overflowing on large negative inputs.
    z = np.clip(z, -50.0, 50.0)
    res = 1 / (1.0 + np.exp(-z))
    return np.clip(res, 1e-8, (1 - (1e-8)))

def _shuffle(X, Y): #X and Y are np.array
randomize = np.arange(X.shape[0])
np.random.shuffle(randomize)
return (X[randomize], Y[randomize])

def split_valid_set(X, Y, percentage):
    """Shuffle (X, Y) and split off a validation set.

    The first floor(len(X) * percentage) shuffled rows form the validation
    set; the remaining rows form the training set.

    Returns:
        X_train, Y_train, X_valid, Y_valid
    """
    n_valid = int(floor(X.shape[0] * percentage))

    shuffled_X, shuffled_Y = _shuffle(X, Y)

    X_train, Y_train = shuffled_X[n_valid:], shuffled_Y[n_valid:]
    X_valid, Y_valid = shuffled_X[:n_valid], shuffled_Y[:n_valid]
    return X_train, Y_train, X_valid, Y_valid

def valid(X, Y, mu1, mu2, shared_sigma, N1, N2):
    """Report the accuracy of the shared-covariance Gaussian classifier.

    Args:
        X: feature matrix, one row per sample.
        Y: labels; flattened before being compared with the predictions.
        mu1, mu2: class mean vectors as returned by train().
        shared_sigma: shared covariance matrix as returned by train().
        N1, N2: number of training samples in each class.

    Returns:
        The fraction of rows whose rounded prediction equals the label. The
        same value is printed as "Valid acc = ...".
    """
    # The pseudo-inverse is used instead of inv(): one-hot indicator columns
    # are linearly dependent, so the covariance can be (near-)singular and a
    # plain inverse may raise LinAlgError or be numerically unstable.
    sigma_inv = np.linalg.pinv(shared_sigma)
    mean_gap = mu1 - mu2
    w = np.dot(mean_gap, sigma_inv)
    b = (
        (-0.5) * np.dot(np.dot(mu1, sigma_inv), mu1)
        + (0.5) * np.dot(np.dot(mu2, sigma_inv), mu2)
        + np.log(float(N1) / N2)
    )
    # Posterior probability of the class described by mu1.
    y = sigmoid(np.dot(X, w) + b)
    y_ = np.around(y)
    result = (np.ravel(Y) == y_)
    acc = float(result.sum()) / result.shape[0]
    print('Valid acc = %f' % acc)
    return acc

def train(X_train, Y_train):
    """Estimate the parameters of a two-class Gaussian generative model.

    Args:
        X_train: 2-D array with one row per sample; any number of features.
        Y_train: labels, shape (n,) or (n, 1). Rows labelled 1 (>50K) form
            the first class, all other rows form the second class.

    Returns:
        mu1, mu2: per-class feature means.
        shared_sigma: the two per-class covariance matrices averaged with
            weights proportional to the class sizes.
        N1, N2: number of samples in each class.

    Raises:
        ValueError: if X_train is not 2-D, the row counts differ, or one of
            the two classes has no samples (its mean would be undefined).
    """
    X = np.asarray(X_train, dtype=float)
    labels = np.asarray(Y_train).reshape(-1)
    if X.ndim != 2:
        raise ValueError("X_train must be a 2-D array")
    if X.shape[0] != labels.shape[0]:
        raise ValueError("X_train and Y_train must have the same number of rows")

    train_data_size = X.shape[0]
    is_class1 = labels == 1
    X1 = X[is_class1]
    X2 = X[~is_class1]
    N1 = int(X1.shape[0])
    N2 = int(X2.shape[0])
    if N1 == 0 or N2 == 0:
        raise ValueError("both classes need at least one training sample")

    # Gaussian distribution parameters: class means ...
    mu1 = X1.mean(axis=0)
    mu2 = X2.mean(axis=0)

    # ... and per-class covariances, i.e. the mean outer product of the
    # centered rows, computed as one matrix product per class.
    centered1 = X1 - mu1
    centered2 = X2 - mu2
    sigma1 = np.dot(centered1.T, centered1) / N1
    sigma2 = np.dot(centered2.T, centered2) / N2

    shared_sigma = (float(N1) / train_data_size) * sigma1 + (float(N2) / train_data_size) * sigma2

    return mu1, mu2, shared_sigma, N1, N2

# print("==========Write output to %s ==============" % save_dir)
# if not os.path.exists(save_dir):
# os.mkdir(save_dir)
# param_dict = {'mu1': mu1, 'mu2':mu2, 'shared_sigma':shared_sigma,'N1':N1, 'N2':N2}
# for key in sorted(param_dict):
# print('Saving %s' % key)
# np.savetxt(os.path.join(save_dir, ('%s' % key)), param_dict[key])

# print("==========Validating============")
# valid(X_valid, Y_valid, mu1, mu2, shared_sigma, N1, N2)









if __name__ == "__main__":
    trainData = pd.read_csv("data/train.csv")
    testData = pd.read_csv("data/test.csv")

    # The training data yields one more indicator column than the test data;
    # drop it so both feature matrices have the same columns.
    x_train = dataProcess_X(trainData).drop(['native_country_ Holand-Netherlands'], axis=1).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values

    # First pass: hold out 10% of the rows and print the validation accuracy.
    vaild_set_percetange = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, vaild_set_percetange)
    mu1, mu2, shared_sigma, N1, N2 = train(X_train, Y_train)
    valid(X_valid, Y_valid, mu1, mu2, shared_sigma, N1, N2)

    # Second pass: refit on the full training set and predict the test set.
    mu1, mu2, shared_sigma, N1, N2 = train(x_train, y_train)
    # Pseudo-inverse rather than inv(): one-hot indicator columns are
    # linearly dependent, so the covariance can be (near-)singular.
    sigma_inv = np.linalg.pinv(shared_sigma)
    w = np.dot((mu1 - mu2), sigma_inv)
    b = (
        (-0.5) * np.dot(np.dot(mu1, sigma_inv), mu1)
        + (0.5) * np.dot(np.dot(mu2, sigma_inv), mu2)
        + np.log(float(N1) / N2)
    )
    a = np.dot(x_test, w) + b
    y = sigmoid(a)
    # Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
    y_ = np.around(y).astype(int)
    # Ids are 1-based and sized from the predictions instead of a hard-coded
    # row count, so a test file of any length produces a consistent frame.
    df = pd.DataFrame({"id": np.arange(1, len(y_) + 1), "label": y_})
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, 'gd_output.csv'), sep='\t', index=False)










49 changes: 48 additions & 1 deletion HW2/hw2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,53 @@
import csv
import matplotlib.pyplot as plt
import pandas as pd
#
# Load the raw training and test tables from the data/ directory.
trainData = pd.read_csv("data/train.csv")
testData = pd.read_csv("data/test.csv")

def dataProcess_X(rawData):
    """Turn the raw table into a standardized, fully numeric feature frame.

    Args:
        rawData: pandas DataFrame holding a "sex" column, any number of
            numeric and text columns and, optionally, the "income" label.

    Returns:
        DataFrame whose first column is "sex" (0 for every value other than
        " Female", 1 for " Female"), followed by the numeric columns and the
        one-hot encoded text columns. Every column is standardized to zero
        mean and unit (sample) standard deviation.

    Raises:
        KeyError: if rawData has no "sex" column.
    """
    # "sex" only has two values, so it is encoded by hand as a single 0/1
    # column instead of being one-hot encoded. "income" is the label and is
    # removed whenever it is present.
    label_columns = ["income"] if "income" in rawData.columns else []
    Data = rawData.drop(["sex"] + label_columns, axis=1)

    # Text columns (object or dedicated string dtype) are one-hot encoded;
    # everything else is treated as already numeric.
    listObjectColumn = [
        col for col in Data.columns
        if pd.api.types.is_string_dtype(Data[col].dtype)
    ]
    listNonObjectColumn = [col for col in Data.columns if col not in listObjectColumn]

    ObjectData = Data[listObjectColumn]
    # Work on a copy so insert() never writes into a slice of `Data`.
    NonObjectData = Data[listNonObjectColumn].copy()
    # "int64" replaces the np.int alias, which no longer exists in NumPy >= 1.24.
    NonObjectData.insert(0, "sex", (rawData["sex"] == " Female").astype("int64"))
    # Every distinct value of a text column becomes its own indicator column.
    ObjectData = pd.get_dummies(ObjectData)

    Data_x = pd.concat([NonObjectData, ObjectData], axis=1).astype("int64")

    # Standardize. A constant column has zero deviation; dividing by 1 there
    # yields 0 for that column instead of filling it with NaN.
    std = Data_x.std()
    std = std.where(std != 0, 1.0)
    Data_x = (Data_x - Data_x.mean()) / std

    return Data_x

def dataProcess_Y(rawData):
    """Extract the label column as a one-column int64 DataFrame.

    A row is labelled 1 when its "income" value equals " >50K" and 0
    otherwise; the resulting column keeps the name "income".
    """
    is_high_income = rawData['income'] == ' >50K'
    return pd.DataFrame(is_high_income.astype("int64"), columns=["income"])

# Build the feature/label frames for both data sets.
x_train = dataProcess_X(trainData)
x_test = dataProcess_X(testData)
y_train = dataProcess_Y(trainData)

# Rebuild the training features without the one indicator column named
# below, keeping a second reference to the result in `a`.
x_train = dataProcess_X(trainData).drop(columns=['native_country_ Holand-Netherlands'])
a = x_train

# Sanity check: list any feature column present in train but not in test,
# then show the final shape of the training matrix.
train_only_columns = set(x_train.columns) - set(x_test.columns)
print(list(train_only_columns))
print(a.shape)







# Read the training CSV once more, treating its first row as the header.
raw = pd.read_csv("data/train.csv", sep=',', header=0)

Empty file added HW2/lr.py
Empty file.
Loading

0 comments on commit 5ad12dc

Please sign in to comment.