1
+ # -*- coding: utf-8 -*-
2
+ # Created on Sun Jan 21 2018 18:53:27
3
+ # Author: WuLC
4
+
5
+
6
+ #####################################################
7
+ # This script offers a framework for stacking, an ensemble method in machine learning
8
+ #####################################################
9
+
10
+ import numpy as np
11
+ from sklearn .model_selection import KFold
12
+
13
+
14
# Load the train and test data as ndarrays.
# An m×n ndarray holds m samples, each with an n-dimensional feature vector.
x_train_file = './data/selectedFeatures/X_train_select.npy'
y_train_file = './data/selectedFeatures/label.npy'
x_test_file = './data/selectedFeatures/X_test_select.npy'

# np.float64 replaces the `np.float` alias, which NumPy deprecated in 1.20 and
# removed in 1.24. The alias was the builtin float, so the dtype is unchanged.
x_train = np.load(x_train_file).astype(np.float64)
y_train = np.load(y_train_file).astype(np.float64)
x_test = np.load(x_test_file).astype(np.float64)
print(x_train.shape, y_train.shape, x_test.shape)
23
+
24
+
25
class BasicModel(object):
    """Parent class of the basic (first-layer) models.

    Subclasses override ``train`` and ``predict``; ``get_oof`` drives them
    through K-fold cross-validation to build the stacking features.
    """

    def train(self, x_train, y_train, x_val, y_val):
        """Return a trained model and its eval metric on the validation data.

        Must be overridden: ``get_oof`` unpacks the result as
        ``(model, metric)``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('subclasses must implement train()')

    def predict(self, model, x_test):
        """Return the predictions of ``model`` for ``x_test``.

        Must be overridden: ``get_oof`` stores the result in 1-D arrays, one
        value per row of ``x_test``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('subclasses must implement predict()')

    def get_oof(self, x_train, y_train, x_test, n_folds=5,
                shuffle=False, random_state=2017):
        """Run K-fold stacking and return out-of-fold predictions.

        For each fold a model is trained on the remaining folds, then used to
        predict the held-out fold and the whole test set.

        Args:
            x_train: training features, indexed by row.
            y_train: training labels, aligned with ``x_train``.
            x_test: test features.
            n_folds: number of cross-validation folds.
            shuffle: whether to shuffle the samples before splitting. The
                default (False) keeps contiguous folds.
            random_state: seed for the shuffle; ignored when ``shuffle`` is
                False.

        Returns:
            Tuple ``(oof_train, oof_test)``: ``oof_train`` has one prediction
            per training row (made by the model that did not see that row);
            ``oof_test`` is the mean over folds of the test-set predictions.
        """
        num_train, num_test = x_train.shape[0], x_test.shape[0]
        oof_train = np.zeros((num_train,))
        oof_test_all_fold = np.zeros((num_test, n_folds))
        aucs = []
        # scikit-learn >= 0.24 raises ValueError when random_state is set
        # while shuffle is False, so only forward the seed when shuffling.
        kfold = KFold(n_splits=n_folds, shuffle=shuffle,
                      random_state=random_state if shuffle else None)
        for i, (train_index, val_index) in enumerate(kfold.split(x_train)):
            print('{0} fold, train {1}, val {2}'.format(
                i, len(train_index), len(val_index)))
            x_tra, y_tra = x_train[train_index], y_train[train_index]
            x_val, y_val = x_train[val_index], y_train[val_index]
            model, auc = self.train(x_tra, y_tra, x_val, y_val)
            aucs.append(auc)
            # Each training row is predicted by the one model that never saw it.
            oof_train[val_index] = self.predict(model, x_val)
            oof_test_all_fold[:, i] = self.predict(model, x_test)
        # Average the per-fold test predictions into a single feature column.
        oof_test = np.mean(oof_test_all_fold, axis=1)
        print('all aucs {0}, average {1}'.format(aucs, np.mean(aucs)))
        return oof_train, oof_test
54
+
55
+
56
+ # create two models for first-layer stacking: xgb and lgb
57
+ import xgboost as xgb
58
class XGBClassifier(BasicModel):
    """First-layer model backed by an xgboost booster."""

    def __init__(self):
        """Set the training parameters."""
        self.num_rounds = 1000
        self.early_stopping_rounds = 15
        # NOTE(review): 'silent' appears to be deprecated in newer xgboost
        # releases in favour of 'verbosity' - confirm against the installed
        # version before changing it.
        self.params = {
            'objective': 'binary:logistic',
            'eta': 0.1,
            'max_depth': 8,
            'eval_metric': 'auc',
            'seed': 0,
            'silent': 0
        }

    def train(self, x_train, y_train, x_val, y_val):
        """Train a booster with early stopping on the validation set.

        Returns:
            Tuple ``(model, metric)`` where ``metric`` is the float parsed
            from ``model.eval`` on the validation data.
        """
        print('train with xgb model')
        xgbtrain = xgb.DMatrix(x_train, y_train)
        xgbval = xgb.DMatrix(x_val, y_val)
        # Early stopping monitors the last entry of the watchlist ('val').
        watchlist = [(xgbtrain, 'train'), (xgbval, 'val')]
        # The original call closed its parenthesis after num_rounds, which
        # left the watchlist and early-stopping arguments outside the call
        # (a syntax error). Keywords are used so the call does not depend on
        # positional argument order.
        model = xgb.train(self.params,
                          xgbtrain,
                          num_boost_round=self.num_rounds,
                          evals=watchlist,
                          early_stopping_rounds=self.early_stopping_rounds)
        # model.eval() returns text such as '[0]\teval-auc:0.912345';
        # keep the number after the colon of the second whitespace token.
        eval_text = model.eval(xgbval)
        metric_token = eval_text.split()[1]
        return model, float(metric_token.split(':')[1])

    def predict(self, model, x_test):
        """Return the predictions of ``model`` for ``x_test``."""
        print('test with xgb model')
        xgbtest = xgb.DMatrix(x_test)
        return model.predict(xgbtest)
88
+
89
+ import lightgbm as lgb
90
class LGBClassifier(BasicModel):
    """First-layer model backed by a LightGBM booster."""

    def __init__(self):
        """Set the training parameters."""
        self.num_boost_round = 2000
        self.early_stopping_rounds = 15
        self.params = {
            'task': 'train',
            'boosting_type': 'dart',
            'objective': 'binary',
            'metric': {'auc', 'binary_logloss'},
            'num_leaves': 80,
            'learning_rate': 0.05,
            # 'scale_pos_weight': 1.5,
            'feature_fraction': 0.5,
            'bagging_fraction': 1,
            'bagging_freq': 5,
            'max_bin': 300,
            'is_unbalance': True,
            'lambda_l2': 5.0,
            'verbose': -1
        }

    def train(self, x_train, y_train, x_val, y_val):
        """Train a booster using the validation set for evaluation.

        Returns:
            Tuple ``(model, auc)`` where ``auc`` is read from
            ``model.best_score['valid_0']['auc']``.
        """
        print('train with lgb model')
        lgbtrain = lgb.Dataset(x_train, y_train)
        lgbval = lgb.Dataset(x_val, y_val)
        # The original call closed its parenthesis after num_boost_round,
        # which left early_stopping_rounds outside the call (a syntax error).
        # NOTE(review): verbose_eval / early_stopping_rounds are believed to
        # have been removed from lgb.train() in LightGBM 4.0 in favour of
        # callbacks - confirm against the installed version when upgrading.
        model = lgb.train(self.params,
                          lgbtrain,
                          valid_sets=lgbval,
                          verbose_eval=self.num_boost_round,
                          num_boost_round=self.num_boost_round,
                          early_stopping_rounds=self.early_stopping_rounds)
        return model, model.best_score['valid_0']['auc']

    def predict(self, model, x_test):
        """Return the predictions of ``model`` at its best iteration."""
        print('test with lgb model')
        return model.predict(x_test, num_iteration=model.best_iteration)
126
+
127
+
128
# Run every first-layer model through K-fold stacking, then arrange the
# out-of-fold predictions as the feature columns of the second layer.
lgb_classifier = LGBClassifier()
lgb_oof_train, lgb_oof_test = lgb_classifier.get_oof(x_train, y_train, x_test)
print(lgb_oof_train.shape, lgb_oof_test.shape)

xgb_classifier = XGBClassifier()
xgb_oof_train, xgb_oof_test = xgb_classifier.get_oof(x_train, y_train, x_test)
print(xgb_oof_train.shape, xgb_oof_test.shape)

# Column order: xgb first, lgb second.
input_train = [xgb_oof_train, lgb_oof_train]
input_test = [xgb_oof_test, lgb_oof_test]

# Turn each prediction vector into a single column and join them side by side.
train_columns = [col.reshape(-1, 1) for col in input_train]
test_columns = [col.reshape(-1, 1) for col in input_test]
stacked_train = np.concatenate(train_columns, axis=1)
stacked_test = np.concatenate(test_columns, axis=1)
print(stacked_train.shape, stacked_test.shape)
143
+
144
+
145
+ # use linear regression (LR) as the model of the second layer
146
+ from sklearn .linear_model import LinearRegression
147
+ from sklearn import metrics
148
+
149
# Hold out the last 20% of the stacked training rows for validation.
n = int(stacked_train.shape[0] * 0.8)
x_tra, x_val = stacked_train[:n], stacked_train[n:]
y_tra, y_val = y_train[:n], y_train[n:]

# Fit on the first 80% and report the AUC on the held-out rows.
model = LinearRegression()
model.fit(x_tra, y_tra)
y_pred = model.predict(x_val)
validation_auc = metrics.roc_auc_score(y_val, y_pred)
print(validation_auc)

# Refit on all stacked training rows and predict on the test data.
final_model = LinearRegression()
final_model.fit(stacked_train, y_train)
test_prediction = final_model.predict(stacked_test)
0 commit comments