forked from rupakc/Kaggle-Compendium
nile-baseline.py
"""Baseline scikit-learn classifiers for the Kaggle West Nile Virus train.csv (WnvPresent target)."""

import pandas as pd

from sklearn import metrics
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder


def get_naive_bayes_models():
    """Return the three scikit-learn naive Bayes variants with default settings."""
    gnb = GaussianNB()
    mnb = MultinomialNB()
    bnb = BernoulliNB()
    classifier_list = [gnb, mnb, bnb]
    classifier_name_list = ['Gaussian NB', 'Multinomial NB', 'Bernoulli NB']
    return classifier_list, classifier_name_list


def get_neural_network(hidden_layer_size=50):
    """Return a single-hidden-layer perceptron of the given width."""
    mlp = MLPClassifier(hidden_layer_sizes=(hidden_layer_size,))
    return [mlp], ['MultiLayer Perceptron']


def get_ensemble_models():
    """Return the tree-based ensemble classifiers and their display names."""
    rf = RandomForestClassifier(n_estimators=51, min_samples_leaf=5, min_samples_split=3)
    bagg = BaggingClassifier(n_estimators=71, random_state=42)
    extra = ExtraTreesClassifier(n_estimators=57, random_state=42)
    ada = AdaBoostClassifier(n_estimators=51, random_state=42)
    grad = GradientBoostingClassifier(n_estimators=101, random_state=42)
    classifier_list = [rf, bagg, extra, ada, grad]
    classifier_name_list = ['Random Forests', 'Bagging', 'Extra Trees', 'AdaBoost', 'Gradient Boost']
    return classifier_list, classifier_name_list


def label_encode_frame(dataframe):
    """Label-encode every string-valued column of the frame, leaving numeric columns untouched."""
    encoder = LabelEncoder()
    for column in dataframe.columns:
        if dataframe[column].dtype == object:
            dataframe[column] = encoder.fit_transform(dataframe[column].values)
    return dataframe


def split_date(list_of_date_string, separator='-', format='yyyy-mm-dd'):
    """Split 'yyyy-mm-dd' date strings into parallel month, day and year lists."""
    month_list = []
    day_list = []
    year_list = []
    for date_string in list_of_date_string:
        date_list = date_string.strip().split(separator)
        month_list.append(int(date_list[1]))
        day_list.append(int(date_list[2]))
        year_list.append(int(date_list[0]))
    return month_list, day_list, year_list


def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test):
    """Print a classification report and accuracy for a fitted model on held-out data."""
    print('--------- For Model : ', trained_model_name, '-----------------')
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test, predicted_values))
    print("Accuracy Score : ", metrics.accuracy_score(y_test, predicted_values))
    print("---------------------------------------\n")


filename = 'train.csv'
train_frame = pd.read_csv(filename)

# Expand the Date column into numeric month/day/year features, then drop it along with the target.
train_frame['month'], train_frame['day'], train_frame['year'] = split_date(list(train_frame['Date'].values))
target_classes = list(train_frame['WnvPresent'].values)
del train_frame['Date']
del train_frame['WnvPresent']

# Encode the remaining string columns and hold out 20% of the rows for evaluation.
train_frame = label_encode_frame(train_frame)
X_train, X_test, y_train, y_test = train_test_split(train_frame.values, target_classes, test_size=0.2, random_state=42)

classifier_list, classifier_name_list = get_ensemble_models()
for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    classifier.fit(X_train, y_train)
    print_evaluation_metrics(classifier, classifier_name, X_test, y_test)
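
# The naive Bayes and neural-network helpers above are defined but never called.
# A minimal sketch of how one of them could be pushed through the same evaluation
# loop, assuming the X_train/X_test split prepared above (note that MultinomialNB
# from get_naive_bayes_models() would additionally require non-negative features):
nn_list, nn_name_list = get_neural_network(hidden_layer_size=50)
for nn_classifier, nn_name in zip(nn_list, nn_name_list):
    nn_classifier.fit(X_train, y_train)
    print_evaluation_metrics(nn_classifier, nn_name, X_test, y_test)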