Skip to content

Commit 9f56714

Browse files
author
Rupak Chakraborty
committed
Walmart Recruiting data added
1 parent 8557fc2 commit 9f56714

6 files changed

+232
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import pandas as pd
2+
from sklearn.ensemble import RandomForestRegressor
3+
from sklearn.ensemble import BaggingRegressor
4+
from sklearn.ensemble import ExtraTreesRegressor
5+
from sklearn.ensemble import AdaBoostRegressor
6+
from sklearn.ensemble import GradientBoostingRegressor
7+
from sklearn.ensemble import RandomTreesEmbedding
8+
from sklearn.neural_network import MLPRegressor
9+
from sklearn.linear_model import ElasticNet
10+
from sklearn.gaussian_process import GaussianProcessRegressor
11+
from sklearn.model_selection import train_test_split
12+
from sklearn.preprocessing import LabelEncoder
13+
from sklearn.preprocessing import Imputer
14+
from sklearn import metrics
15+
import numpy as np
16+
17+
18+
def get_gaussian_process_regressor():
    """Build a Gaussian-process regressor with default settings.

    Returns:
        A ``(models, names)`` pair: a one-element list holding an unfitted
        ``GaussianProcessRegressor`` and a one-element list holding its
        display name.
    """
    models = [GaussianProcessRegressor()]
    names = ['Gaussian Process']
    return models, names
21+
22+
23+
def get_mlp_regressor(num_hidden_units=51):
    """Build a multi-layer perceptron regressor.

    Args:
        num_hidden_units: Passed through unchanged as the regressor's
            ``hidden_layer_sizes`` argument.

    Returns:
        A ``(models, names)`` pair: a one-element list holding an unfitted
        ``MLPRegressor`` and a one-element list holding its display name.
    """
    models = [MLPRegressor(hidden_layer_sizes=num_hidden_units)]
    names = ['Multi-Layer Perceptron']
    return models, names
26+
27+
28+
def get_ensemble_models():
    """Build the set of ensemble regressors compared by this script.

    Every estimator is seeded with ``random_state=42``.

    Returns:
        A ``(models, names)`` pair of equal-length lists: the unfitted
        regressors and, position by position, their display names.
    """
    seed = 42
    # Keep each display name next to the estimator it describes.
    named_models = [
        ('Random Forests',
         RandomForestRegressor(n_estimators=51, min_samples_leaf=5,
                               min_samples_split=3, random_state=seed)),
        ('Bagging', BaggingRegressor(n_estimators=51, random_state=seed)),
        ('Extra Trees', ExtraTreesRegressor(n_estimators=71, random_state=seed)),
        ('AdaBoost', AdaBoostRegressor(random_state=seed)),
        ('Gradient Boost',
         GradientBoostingRegressor(n_estimators=101, random_state=seed)),
    ]
    regressors = [model for _, model in named_models]
    labels = [label for label, _ in named_models]
    return regressors, labels
37+
38+
39+
def get_linear_model():
    """Build an elastic-net linear regressor with default settings.

    Returns:
        A ``(models, names)`` pair: a one-element list holding an unfitted
        ``ElasticNet`` and a one-element list holding its display name.
    """
    models = [ElasticNet()]
    names = ['Elastic Net']
    return models, names
42+
43+
44+
def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test):
    """Print regression error metrics for a fitted model.

    Calls ``trained_model.predict(X_test)`` and prints the mean absolute
    error, median absolute error, mean squared error and R2 score of the
    predictions against ``y_test``.

    Args:
        trained_model: Fitted estimator exposing ``predict``.
        trained_model_name: Label printed in the report header.
        X_test: Feature rows handed to ``predict``.
        y_test: True target values compared with the predictions.

    Returns:
        None. The report is written to standard output.
    """
    # Each line is formatted into ONE string before printing: print(<str>)
    # parses as a statement on Python 2 and as a function call on Python 3
    # and emits the same text on both. The doubled spaces reproduce the
    # separator the old comma-style print statement inserted.
    print('--------- For Model :  %s  ---------\n' % (trained_model_name,))
    predicted_values = trained_model.predict(X_test)
    print("Mean Absolute Error :  %s"
          % (metrics.mean_absolute_error(y_test, predicted_values),))
    print("Median Absolute Error :  %s"
          % (metrics.median_absolute_error(y_test, predicted_values),))
    print("Mean Squared Error :  %s"
          % (metrics.mean_squared_error(y_test, predicted_values),))
    print("R2 Score :  %s" % (metrics.r2_score(y_test, predicted_values),))
    print("---------------------------------------\n")
52+
53+
54+
def label_encode_frame(dataframe):
    """Label-encode the string-valued columns of ``dataframe`` in place.

    A column is encoded when its first value is a ``str``, or when its first
    value is missing (NaN) and a ``str`` occurs within the first 1001 rows.
    All other columns are left untouched.

    Args:
        dataframe: pandas DataFrame to encode; it is modified in place.

    Returns:
        The same ``dataframe`` object.
    """
    encoder = LabelEncoder()
    for column in dataframe.columns:
        series = dataframe[column]
        if len(series) == 0:
            # Nothing to inspect or encode in an empty column.
            continue
        # Positional access: the first row need not carry the label 0.
        first = series.iloc[0]
        if isinstance(first, str):
            dataframe[column] = encoder.fit_transform(series.values)
        elif isinstance(first, float) and first != first:
            # NaN is the only float that differs from itself. The previous
            # test, `type(value) is np.nan`, compared a class with a float
            # and was never true, so this branch could not run.
            if any(isinstance(value, str) for value in series.iloc[:1001]):
                # The column mixes NaN with text. Casting to str turns the
                # missing entries into the text 'nan', so the encoder sees
                # a single value type.
                dataframe[column] = encoder.fit_transform(
                    series.astype(str).values)
    return dataframe
68+
69+
70+
def spilt_date(list_of_date_string, separator='-', format='yyyy-mm-dd'):
    """Split date strings into parallel month, day and year lists.

    Each string is stripped of surrounding whitespace and split on
    ``separator``; field 0 is taken as the year, field 1 as the month and
    field 2 as the day. The pieces are returned as strings.

    Args:
        list_of_date_string: Iterable of date strings.
        separator: Delimiter between the date fields.
        format: Accepted but never read by this function.

    Returns:
        A tuple ``(months, days, years)`` of three lists, one entry per
        input string.
    """
    rows = []
    for raw_date in list_of_date_string:
        fields = raw_date.strip().split(separator)
        rows.append((fields[1], fields[2], fields[0]))
    months = [row[0] for row in rows]
    days = [row[1] for row in rows]
    years = [row[2] for row in rows]
    return months, days, years
80+
81+
82+
def isfloat(value):
    """Report whether ``float(value)`` succeeds.

    Args:
        value: Any object.

    Returns:
        True when ``float(value)`` returns normally; False when it raises
        ``TypeError``, ``ValueError`` or ``OverflowError``.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float". Anything else, such
        # as KeyboardInterrupt, must propagate; the earlier bare `except`
        # swallowed it.
        return False
    return True
88+
89+
90+
def _coerce_float(value):
    """Return ``float(value)``, or None when the conversion fails."""
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return None


def handle_mixed_data_types(dataframe):
    """Turn mostly-numeric columns into floats, imputing the column mean.

    For every column, each value is tried with ``float()``. When at least as
    many values convert as fail, the column is replaced by floats: values
    that do not convert, and NaN entries, are replaced by the mean of the
    usable numbers. Columns where non-convertible values are the majority,
    and empty columns, are left untouched.

    Args:
        dataframe: pandas DataFrame to clean; it is modified in place.

    Returns:
        The same ``dataframe`` object.
    """
    for column_name in dataframe.columns:
        # Convert each value exactly once; None marks "not convertible".
        parsed = [_coerce_float(value)
                  for value in dataframe[column_name].values]
        numeric_count = sum(1 for number in parsed if number is not None)
        string_count = len(parsed) - numeric_count
        if not parsed or numeric_count < string_count:
            # Empty column (no mean to compute) or a mostly-text column.
            continue
        # NaN converts with float() but would turn the whole mean into NaN,
        # so it is left out of the average. `x == x` is False only for NaN.
        usable = [number for number in parsed
                  if number is not None and number == number]
        mean = sum(usable) / len(usable) if usable else float('nan')
        dataframe[column_name] = [
            mean if (number is None or number != number) else number
            for number in parsed
        ]
    return dataframe
111+
112+
113+
# Input tables, read from the current working directory.
weather_filename = 'weather.csv'
train_filename = 'train.csv'
key_filename = 'key.csv'

weather_frame = pd.read_csv(weather_filename)
train_frame = pd.read_csv(train_filename)
key_frame = pd.read_csv(key_filename)

# Discard two weather columns, then clean the remaining ones.
weather_frame = weather_frame.drop(['codesum', 'depart'], axis=1)
weather_frame = handle_mixed_data_types(weather_frame)

# Join training rows to the key table on store number, then to the weather
# table on station number and date.
final_frame = pd.merge(train_frame, key_frame, on='store_nbr', how='inner')
final_frame = pd.merge(final_frame, weather_frame,
                       on=['station_nbr', 'date'], how='inner')

# 'units' is the regression target; 'date' is replaced by its components.
target_values = list(final_frame['units'].values)
final_frame['month'], final_frame['day'], final_frame['year'] = spilt_date(
    list(final_frame['date'].values))
final_frame = final_frame.drop(['units', 'date'], axis=1)

# Hold out 20% of the rows, then fit and report every ensemble model.
X_train, X_test, y_train, y_test = train_test_split(
    final_frame.values, target_values, test_size=0.2, random_state=42)
regressor_list, regressor_name_list = get_ensemble_models()
for regressor, regressor_name in zip(regressor_list, regressor_name_list):
    regressor.fit(X_train, y_train)
    print_evaluation_metrics(regressor, regressor_name, X_test, y_test)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import pandas as pd
2+
from sklearn.ensemble import RandomForestRegressor
3+
from sklearn.ensemble import BaggingRegressor
4+
from sklearn.ensemble import ExtraTreesRegressor
5+
from sklearn.ensemble import AdaBoostRegressor
6+
from sklearn.ensemble import GradientBoostingRegressor
7+
from sklearn.ensemble import RandomTreesEmbedding
8+
from sklearn.neural_network import MLPRegressor
9+
from sklearn.linear_model import ElasticNet
10+
from sklearn.gaussian_process import GaussianProcessRegressor
11+
from sklearn.model_selection import train_test_split
12+
from sklearn.preprocessing import LabelEncoder
13+
from sklearn.preprocessing import Imputer
14+
from sklearn import metrics
15+
import numpy as np
16+
17+
18+
def get_gaussian_process_regressor():
    """Build a Gaussian-process regressor with default settings.

    Returns:
        A ``(models, names)`` pair: a one-element list holding an unfitted
        ``GaussianProcessRegressor`` and a one-element list holding its
        display name.
    """
    models = [GaussianProcessRegressor()]
    names = ['Gaussian Process']
    return models, names
21+
22+
23+
def get_mlp_regressor(num_hidden_units=51):
    """Build a multi-layer perceptron regressor.

    Args:
        num_hidden_units: Passed through unchanged as the regressor's
            ``hidden_layer_sizes`` argument.

    Returns:
        A ``(models, names)`` pair: a one-element list holding an unfitted
        ``MLPRegressor`` and a one-element list holding its display name.
    """
    models = [MLPRegressor(hidden_layer_sizes=num_hidden_units)]
    names = ['Multi-Layer Perceptron']
    return models, names
26+
27+
28+
def get_ensemble_models():
    """Build the set of ensemble regressors compared by this script.

    Every estimator is seeded with ``random_state=42``.

    Returns:
        A ``(models, names)`` pair of equal-length lists: the unfitted
        regressors and, position by position, their display names.
    """
    seed = 42
    # Keep each display name next to the estimator it describes.
    named_models = [
        ('Random Forests',
         RandomForestRegressor(n_estimators=51, min_samples_leaf=5,
                               min_samples_split=3, random_state=seed)),
        ('Bagging', BaggingRegressor(n_estimators=51, random_state=seed)),
        ('Extra Trees', ExtraTreesRegressor(n_estimators=71, random_state=seed)),
        ('AdaBoost', AdaBoostRegressor(random_state=seed)),
        ('Gradient Boost',
         GradientBoostingRegressor(n_estimators=101, random_state=seed)),
    ]
    regressors = [model for _, model in named_models]
    labels = [label for label, _ in named_models]
    return regressors, labels
37+
38+
39+
def get_linear_model():
    """Build an elastic-net linear regressor with default settings.

    Returns:
        A ``(models, names)`` pair: a one-element list holding an unfitted
        ``ElasticNet`` and a one-element list holding its display name.
    """
    models = [ElasticNet()]
    names = ['Elastic Net']
    return models, names
42+
43+
44+
def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test):
    """Print regression error metrics for a fitted model.

    Calls ``trained_model.predict(X_test)`` and prints the mean absolute
    error, median absolute error, mean squared error and R2 score of the
    predictions against ``y_test``.

    Args:
        trained_model: Fitted estimator exposing ``predict``.
        trained_model_name: Label printed in the report header.
        X_test: Feature rows handed to ``predict``.
        y_test: True target values compared with the predictions.

    Returns:
        None. The report is written to standard output.
    """
    # Each line is formatted into ONE string before printing: print(<str>)
    # parses as a statement on Python 2 and as a function call on Python 3
    # and emits the same text on both. The doubled spaces reproduce the
    # separator the old comma-style print statement inserted.
    print('--------- For Model :  %s  ---------\n' % (trained_model_name,))
    predicted_values = trained_model.predict(X_test)
    print("Mean Absolute Error :  %s"
          % (metrics.mean_absolute_error(y_test, predicted_values),))
    print("Median Absolute Error :  %s"
          % (metrics.median_absolute_error(y_test, predicted_values),))
    print("Mean Squared Error :  %s"
          % (metrics.mean_squared_error(y_test, predicted_values),))
    print("R2 Score :  %s" % (metrics.r2_score(y_test, predicted_values),))
    print("---------------------------------------\n")
52+
53+
54+
def label_encode_frame(dataframe):
    """Label-encode the string- or bool-valued columns of ``dataframe``.

    A column is encoded when its first value is a ``str`` or ``bool``, or
    when its first value is missing (NaN) and a ``str`` or ``bool`` occurs
    within the first 1001 rows. All other columns are left untouched.

    Args:
        dataframe: pandas DataFrame to encode; it is modified in place.

    Returns:
        The same ``dataframe`` object.
    """
    encodable = (str, bool)
    encoder = LabelEncoder()
    for column in dataframe.columns:
        series = dataframe[column]
        if len(series) == 0:
            # Nothing to inspect or encode in an empty column.
            continue
        # Positional access: the first row need not carry the label 0.
        first = series.iloc[0]
        if isinstance(first, encodable):
            dataframe[column] = encoder.fit_transform(series.values)
        elif isinstance(first, float) and first != first:
            # NaN is the only float that differs from itself. The previous
            # test, `type(value) is np.nan`, compared a class with a float
            # and was never true, so this branch could not run.
            if any(isinstance(value, encodable)
                   for value in series.iloc[:1001]):
                # The column mixes NaN with labels. Casting to str turns
                # the missing entries into the text 'nan', so the encoder
                # sees a single value type.
                dataframe[column] = encoder.fit_transform(
                    series.astype(str).values)
    return dataframe
68+
69+
70+
def spilt_date(list_of_date_string, separator='-', format='yyyy-mm-dd'):
    """Split date strings into parallel month, day and year integer lists.

    Each string is stripped of surrounding whitespace and split on
    ``separator``; field 0 is taken as the year, field 1 as the month and
    field 2 as the day. Every piece is converted with ``int``.

    Args:
        list_of_date_string: Iterable of date strings.
        separator: Delimiter between the date fields.
        format: Accepted but never read by this function.

    Returns:
        A tuple ``(months, days, years)`` of three integer lists, one entry
        per input string.
    """
    rows = []
    for raw_date in list_of_date_string:
        fields = raw_date.strip().split(separator)
        # Convert in month, day, year order, one row at a time.
        rows.append((int(fields[1]), int(fields[2]), int(fields[0])))
    months = [row[0] for row in rows]
    days = [row[1] for row in rows]
    years = [row[2] for row in rows]
    return months, days, years
80+
81+
82+
# Input tables, read from the current working directory.
train_filename = 'train.csv'
feature_filename = 'features.csv'
train_frame = pd.read_csv(train_filename)
feature_frame = pd.read_csv(feature_filename)

# 'Date' stays in the feature table because it is half of the join key.
# Dropping it and joining on 'Store' alone paired every training row with
# every feature row of the same store.
# NOTE(review): assumes features.csv holds one row per (Store, Date) -
# confirm against the data before relying on the metrics.
columns_to_delete = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
                     'MarkDown5', 'IsHoliday']
feature_frame.drop(columns_to_delete, axis=1, inplace=True)
merged_frame = pd.merge(train_frame, feature_frame, on=['Store', 'Date'])

# 'Weekly_Sales' is the regression target; 'Date' is replaced by its
# numeric components.
target_values = list(merged_frame['Weekly_Sales'].values)
merged_frame['Month'], merged_frame['Day'], merged_frame['Year'] = spilt_date(
    list(merged_frame['Date'].values))
del merged_frame['Date']
del merged_frame['Weekly_Sales']
merged_frame = label_encode_frame(merged_frame)
# Single-argument print(...) runs on both Python 2 and Python 3.
print(len(merged_frame))

# Hold out 20% of the rows, then fit and report every ensemble model.
X_train, X_test, y_train, y_test = train_test_split(
    merged_frame.values, target_values, test_size=0.2, random_state=42)
regressor_list, regressor_name_list = get_ensemble_models()
for regressor, regressor_name in zip(regressor_list, regressor_name_list):
    regressor.fit(X_train, y_train)
    print_evaluation_metrics(regressor, regressor_name, X_test, y_test)
100+
101+

0 commit comments

Comments
 (0)