-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
96 lines (87 loc) · 4.56 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import numpy as np #numerical computation
import pandas as pd #data wrangling
#Next line helps with rendering plots
#to visualize decision trees
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pickle
# --- Load and clean the raw credit-card default spreadsheet ------------------
# One row per account; the first column is the account ID.
df_orig = pd.read_excel("default_of_credit_card_clients.xls")

# Rows where every feature column (everything but the ID) is zero are empty
# duplicate records; build a mask and drop them.
df_zero_mask = df_orig == 0
feature_zero_mask = df_zero_mask.iloc[:, 1:].all(axis=1)
df_clean = df_orig.loc[~feature_zero_mask, :].copy()

# Collapse undocumented EDUCATION codes {0, 5, 6} into 4 and undocumented
# MARRIAGE code 0 into 3 (the "other" categories).
# Assign the result instead of calling .replace(..., inplace=True) on a column
# selection: chained inplace replace is deprecated in pandas 2.x and may
# silently operate on a copy.
df_clean['EDUCATION'] = df_clean['EDUCATION'].replace(to_replace=[0, 5, 6], value=4)
df_clean['MARRIAGE'] = df_clean['MARRIAGE'].replace(to_replace=0, value=3)

# Separate the rows whose PAY_1 is the placeholder string 'Not available';
# these are imputed further down.
missing_pay_1_mask = df_clean['PAY_1'] == 'Not available'
df_missing_pay_1 = df_clean.loc[missing_pay_1_mask, :].copy()
# --- Build the modeling dataset from the pre-cleaned CSV ---------------------
df = pd.read_csv("cleaned_data.csv")

# Keep every column except identifiers, the later payment-status lags, and the
# one-hot education columns; after filtering, the last remaining column is the
# response ('default payment next month').
features_response = df.columns.tolist()
items_to_remove = ['ID', 'SEX', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
                   'EDUCATION_CAT', 'graduate school', 'high school', 'none',
                   'others', 'university']
features_response = [item for item in features_response if item not in items_to_remove]

# 80/20 train/test split; random_state fixed for reproducibility.
X_train, X_test, y_train, y_test = \
train_test_split(df[features_response[:-1]].values,
                 df['default payment next month'].values,
                 test_size=0.2, random_state=24)

np.random.seed(seed=1)
# Candidate PAY_1 imputation values: the mode (0) and a random resample of the
# observed training PAY_1 column (assumed to be feature index 4 — TODO confirm
# against features_response ordering). Size the draw from the data itself
# rather than hard-coding 3021, so it stays correct if the input file changes.
fill_values = [0,
               np.random.choice(X_train[:, 4],
                                size=(df_missing_pay_1.shape[0],),
                                replace=True)]
fill_strategy = ['mode', 'random']

# 4-fold splitter shared by the imputation-comparison cross-validation.
k_folds = KFold(n_splits=4, shuffle=True, random_state=1)
# Random forest used for the imputation-comparison CV (commented out below)
# and as the base estimator for the PAY_1 imputation grid search.
# Parameters removed from modern scikit-learn have been dropped:
#   * max_features='auto' — removed in 1.1; 'sqrt' is the classifier
#     equivalent of the old 'auto' behavior.
#   * min_impurity_split=None — removed in 1.0 (min_impurity_decrease is the
#     supported replacement and is already passed).
rf = RandomForestClassifier(
    n_estimators=200, criterion='gini', max_depth=9,
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
    max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0,
    bootstrap=True, oob_score=False, n_jobs=None,
    random_state=4, verbose=1, warm_start=False, class_weight=None)
# Commented-out experiment: compare the two PAY_1 imputation strategies by
# 4-fold ROC-AUC cross-validation on the combined data.
'''
for counter in range(len(fill_values)):
    #Copy the data frame with missing PAY_1 and assign imputed values
    df_fill_pay_1_filled = df_missing_pay_1.copy()
    df_fill_pay_1_filled['PAY_1'] = fill_values[counter]
    #Split imputed data in to training and testing, using the same
    #80/20 split we have used for the data with non-missing PAY_1
    X_fill_pay_1_train, X_fill_pay_1_test, y_fill_pay_1_train, y_fill_pay_1_test = \
    train_test_split(
        df_fill_pay_1_filled[features_response[:-1]].values,
        df_fill_pay_1_filled['default payment next month'].values,
        test_size=0.2, random_state=24)
    #Concatenate the imputed data with the array of non-missing data
    X_train_all = np.concatenate((X_train, X_fill_pay_1_train), axis=0)
    y_train_all = np.concatenate((y_train, y_fill_pay_1_train), axis=0)
    #Use the KFolds splitter and the random forest model to get
    #4-fold cross-validation scores for both imputation methods
    #(error_score='raise' replaces the removed 'raise-deprecating' option)
    imputation_compare_cv = cross_validate(rf, X_train_all, y_train_all, scoring='roc_auc',
                                           cv=k_folds, n_jobs=-1, verbose=1,
                                           return_train_score=True, return_estimator=True,
                                           error_score='raise')
'''
# --- Grid-search a random forest that imputes PAY_1 from the other features --
pay_1_df = df.copy()

# Predictors for the imputation model: everything except identifiers, the
# later payment lags, one-hot education columns, the response, and PAY_1
# itself (which is the imputation target).
features_for_imputation = pay_1_df.columns.tolist()
items_to_remove_2 = ['ID', 'SEX', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
                     'EDUCATION_CAT', 'graduate school', 'high school', 'none',
                     'others', 'university', 'default payment next month', 'PAY_1']
features_for_imputation = [item for item in features_for_imputation
                           if item not in items_to_remove_2]

# Same 80/20 split convention as the main model, predicting PAY_1.
X_impute_train, X_impute_test, y_impute_train, y_impute_test = \
train_test_split(
    pay_1_df[features_for_imputation].values,
    pay_1_df['PAY_1'].values,
    test_size=0.2, random_state=24)

rf_impute_params = {'max_depth': [3, 6, 9, 12],
                    'n_estimators': [10, 50, 100, 200]}
# NOTE: the `iid` argument was removed in scikit-learn 0.24; the default
# behavior since then matches the old iid=False, so dropping it preserves
# the original scoring semantics.
cv_rf_impute = GridSearchCV(rf, param_grid=rf_impute_params, scoring='accuracy',
                            n_jobs=-1, refit=True,
                            cv=4, verbose=2, error_score=np.nan,
                            return_train_score=True)
cv_rf_impute.fit(X_impute_train, y_impute_train)

# Commented-out spot check of a single prediction.
'''check=[50000,2,1,57,-1,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689
]
ch=cv_rf_impute.predict([check])
print(ch)
'''
# Commented-out report of the imputation-comparison CV scores.
'''test_score = imputation_compare_cv['test_score']
print(fill_strategy[counter] + ' imputation: ' +
'mean testing score ' + str(np.mean(test_score)) +
', std ' + str(np.std(test_score)))'''

# pickle.dump returns None, so the old `model = pickle.dump(...)` never held a
# model and leaked the open file handle. Use a context manager so the file is
# flushed and closed deterministically.
with open('Classifier.pkl', 'wb') as model_file:
    pickle.dump(cv_rf_impute, model_file)