@@ -59,37 +59,47 @@ def feature_utility(data, selected_feature_name, target_name):
59
59
plt .legend ([bar [0 ] for bar in bars ], target_classes , loc = 'best' )
60
60
plt .show ()
61
61
62
def encode_label(data):
    """Label-encode the categorical columns of the bank-marketing dataset.

    Each categorical feature column (plus the target column ``'y'``) is cast
    to the pandas ``category`` dtype and then replaced by integer codes via
    ``sklearn.preprocessing.LabelEncoder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Bank-marketing frame containing the listed categorical columns.

    Returns
    -------
    pandas.DataFrame
        The same frame, mutated in place, with categorical columns encoded
        as integers.
    """
    la_en = preprocessing.LabelEncoder()
    for col in ['job', 'marital', 'education', 'default', 'housing', 'loan',
                'contact', 'month', 'poutcome', 'y']:
        # BUG FIX: read from the `data` argument, not the module-level
        # `bank_data` — the original referenced the global, so the function
        # silently encoded the wrong frame when called on any other input.
        data[col] = data[col].astype('category')
        data[col] = la_en.fit_transform(data[col])
    return data
62
70
# --- Load the bank-marketing data (index 1 = the full dataset) ---
dataset_path = ['bank.csv', 'bank-full.csv']
bank_data = pd.read_csv(dataset_path[1], sep=';')
print(bank_data.head())

# good categorical features: job, marital, education, housing, loan, contact, month, poutcome
# bad categorical features: default
# feature_utility(bank_data, 'housing', 'y')

# Convert all categorical columns (and the target 'y') to integer codes.
bank_data = encode_label(bank_data)

# print(bank_data.dtypes)
# print(bank_data.head())

# Last column is the target; everything before it is a feature.
X_data, y_data = bank_data.iloc[:, :-1], bank_data.iloc[:, -1]

# Show how imbalanced the target is: 'no' is the majority class, so
# value_counts() (sorted by frequency) yields (no, yes) in that order.
answer_no, answer_yes = y_data.value_counts()
print('Percentage of answering no: ', answer_no / (answer_no + answer_yes))

X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data,
    test_size=0.2)

# 'balanced' class weights compensate for the skewed yes/no distribution.
dt_clf = DecisionTreeClassifier(class_weight='balanced')
rf_clf = RandomForestClassifier(class_weight='balanced')

# Randomize the data and run cross-validation 5 times.
# NOTE(review): the positional-n / n_iter signature is the legacy
# sklearn.cross_validation.ShuffleSplit API — confirm the installed
# sklearn version still provides it (modern API uses n_splits).
cv = ShuffleSplit(X_data.shape[0], n_iter=5,
                  test_size=0.3, random_state=0)

# F1 is a more informative metric than accuracy on this imbalanced target.
print(cross_val_score(dt_clf, X_data, y_data, cv=cv, scoring='f1').mean())
print(cross_val_score(rf_clf, X_data, y_data, cv=cv, scoring='f1').mean())

# dt_clf.fit(X_train, y_train)
# print(dt_clf.score(X_test, y_test))
# rf_clf.fit(X_train, y_train)
# print(rf_clf.score(X_test, y_test))

# print(rf_clf.predict(X_test.iloc[10, :][np.newaxis, :]))
# print(y_test.iloc[10])
0 commit comments