Skip to content

Commit 02e93fa

Browse files
committed
a framework for stacking
1 parent a309080 commit 02e93fa

File tree

1 file changed

+161
-0
lines changed

1 file changed

+161
-0
lines changed

python/Stacking.py

+161
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# -*- coding: utf-8 -*-
2+
# Created on Sun Jan 21 2018 18:53:27
3+
# Author: WuLC
4+
5+
6+
#####################################################
7+
# This script offers a framework for stacking, an ensemble method in machine learning
8+
#####################################################
9+
10+
import numpy as np
11+
from sklearn.model_selection import KFold
12+
13+
14+
# load train data and test data as ndarray
15+
# an m×n ndarray holds m samples, each with an n-dimensional feature vector
16+
# Locations of the pre-selected feature matrices and the training labels.
x_train_file = './data/selectedFeatures/X_train_select.npy'
y_train_file = './data/selectedFeatures/label.npy'
x_test_file = './data/selectedFeatures/X_test_select.npy'

# np.float was only an alias for the builtin float (i.e. float64); it was
# deprecated in NumPy 1.20 and removed in 1.24. np.float64 yields the same
# dtype and works on every NumPy version.
x_train = np.load(x_train_file).astype(np.float64)
y_train = np.load(y_train_file).astype(np.float64)
x_test = np.load(x_test_file).astype(np.float64)
print(x_train.shape, y_train.shape, x_test.shape)
23+
24+
25+
class BasicModel(object):
    """Parent class of the first-layer (base) models used for stacking.

    Subclasses implement ``train`` and ``predict``; ``get_oof`` uses those two
    methods to build the out-of-fold features fed to the second layer.
    """

    def train(self, x_train, y_train, x_val, y_val):
        """Return a trained model and its eval metric on the validation data.

        Must be overridden; ``get_oof`` unpacks the result as ``(model, auc)``.
        """
        raise NotImplementedError('subclasses must implement train()')

    def predict(self, model, x_test):
        """Return the predictions of ``model`` for ``x_test``.

        Must be overridden; ``get_oof`` stores the result in 1-D arrays, so one
        value per sample is expected.
        """
        raise NotImplementedError('subclasses must implement predict()')

    def get_oof(self, x_train, y_train, x_test, n_folds=5):
        """K-fold stacking: compute out-of-fold predictions.

        The training data is split into ``n_folds`` folds. For each fold a
        model is trained on the remaining folds, then used to predict both the
        held-out fold and the whole test set.

        Args:
            x_train: training features, indexable by integer index arrays.
            y_train: training labels, indexable the same way as ``x_train``.
            x_test: test features.
            n_folds: number of folds.

        Returns:
            ``(oof_train, oof_test)`` where ``oof_train`` has one prediction
            per training sample, made by the fold model that did not see that
            sample, and ``oof_test`` is the per-sample mean of the test
            predictions of all fold models.
        """
        num_train, num_test = x_train.shape[0], x_test.shape[0]
        oof_train = np.zeros((num_train,))
        oof_test_all_fold = np.zeros((num_test, n_folds))
        aucs = []
        # Folds are taken in order (no shuffling). A random_state has no effect
        # without shuffle=True, and newer scikit-learn versions raise a
        # ValueError for that combination, so none is passed.
        KF = KFold(n_splits=n_folds)
        for i, (train_index, val_index) in enumerate(KF.split(x_train)):
            print('{0} fold, train {1}, val {2}'.format(i, len(train_index), len(val_index)))
            x_tra, y_tra = x_train[train_index], y_train[train_index]
            x_val, y_val = x_train[val_index], y_train[val_index]
            model, auc = self.train(x_tra, y_tra, x_val, y_val)
            aucs.append(auc)
            # Each training sample is predicted only by the model that never saw it.
            oof_train[val_index] = self.predict(model, x_val)
            oof_test_all_fold[:, i] = self.predict(model, x_test)
        # Average the fold models' test predictions into a single column.
        oof_test = np.mean(oof_test_all_fold, axis=1)
        print('all aucs {0}, average {1}'.format(aucs, np.mean(aucs)))
        return oof_train, oof_test
54+
55+
56+
# create two models for first-layer stacking: xgb and lgb
57+
import xgboost as xgb
58+
class XGBClassifier(BasicModel):
    """First-layer model backed by the native xgboost training API."""

    def __init__(self):
        """set parameters"""
        self.num_rounds = 1000
        self.early_stopping_rounds = 15
        self.params = {
            'objective': 'binary:logistic',
            'eta': 0.1,
            'max_depth': 8,
            'eval_metric': 'auc',
            'seed': 0,
            'silent': 0,
        }

    def train(self, x_train, y_train, x_val, y_val):
        """Train a booster with early stopping and return ``(model, auc)``.

        The AUC is the booster's evaluation on the validation data.
        """
        print('train with xgb model')
        xgbtrain = xgb.DMatrix(x_train, y_train)
        xgbval = xgb.DMatrix(x_val, y_val)
        # The validation set is listed last: xgboost's early stopping
        # monitors the last entry of the evaluation list.
        watchlist = [(xgbtrain, 'train'), (xgbval, 'val')]
        # All options are passed inside a single call; previously the call was
        # closed after the round count, so the watchlist and early stopping
        # were never handed to xgb.train.
        model = xgb.train(self.params,
                          xgbtrain,
                          num_boost_round=self.num_rounds,
                          evals=watchlist,
                          early_stopping_rounds=self.early_stopping_rounds)
        # Booster.eval returns text such as '[0]\teval-auc:0.93'; take the
        # number after the colon of the second whitespace-separated token.
        return model, float(model.eval(xgbval).split()[1].split(':')[1])

    def predict(self, model, x_test):
        """Return the booster's predictions for ``x_test``."""
        print('test with xgb model')
        xgbtest = xgb.DMatrix(x_test)
        return model.predict(xgbtest)
88+
89+
import lightgbm as lgb
90+
class LGBClassifier(BasicModel):
    """First-layer model backed by the native lightgbm training API."""

    def __init__(self):
        """Set the number of boosting rounds, early stopping and parameters."""
        self.num_boost_round = 2000
        self.early_stopping_rounds = 15
        self.params = {
            'task': 'train',
            'boosting_type': 'dart',
            'objective': 'binary',
            'metric': {'auc', 'binary_logloss'},
            'num_leaves': 80,
            'learning_rate': 0.05,
            # 'scale_pos_weight': 1.5,
            'feature_fraction': 0.5,
            'bagging_fraction': 1,
            'bagging_freq': 5,
            'max_bin': 300,
            'is_unbalance': True,
            'lambda_l2': 5.0,
            'verbose': -1,
        }

    def train(self, x_train, y_train, x_val, y_val):
        """Train a booster and return ``(model, auc)``.

        The AUC is read from the booster's best score on the validation set.
        """
        print('train with lgb model')
        lgbtrain = lgb.Dataset(x_train, y_train)
        lgbval = lgb.Dataset(x_val, y_val)
        # All options are passed inside a single call; previously the call was
        # closed after num_boost_round, leaving early_stopping_rounds outside.
        # NOTE(review): LightGBM documents early stopping as unavailable in
        # dart mode - confirm whether early_stopping_rounds has any effect.
        model = lgb.train(self.params,
                          lgbtrain,
                          valid_sets=lgbval,
                          verbose_eval=self.num_boost_round,
                          num_boost_round=self.num_boost_round,
                          early_stopping_rounds=self.early_stopping_rounds)
        # 'valid_0' is the name under which the single validation set is
        # looked up in best_score.
        return model, model.best_score['valid_0']['auc']

    def predict(self, model, x_test):
        """Return predictions for ``x_test`` using the model's best iteration."""
        print('test with lgb model')
        return model.predict(x_test, num_iteration=model.best_iteration)
126+
127+
128+
# get output of first layer models and construct as input for the second layer
129+
# First layer, LightGBM: out-of-fold predictions for train and test.
lgb_classifier = LGBClassifier()
lgb_oof_train, lgb_oof_test = lgb_classifier.get_oof(x_train, y_train, x_test)
print(lgb_oof_train.shape, lgb_oof_test.shape)

# First layer, XGBoost: same procedure.
xgb_classifier = XGBClassifier()
xgb_oof_train, xgb_oof_test = xgb_classifier.get_oof(x_train, y_train, x_test)
print(xgb_oof_train.shape, xgb_oof_test.shape)

# Column order (xgb first, then lgb) must match between train and test.
input_train = [xgb_oof_train, lgb_oof_train]
input_test = [xgb_oof_test, lgb_oof_test]

# Turn each prediction vector into a column and join the columns side by
# side, giving one row per sample and one column per first-layer model.
stacked_train = np.concatenate(
    [column.reshape(-1, 1) for column in input_train],
    axis=1,
)
stacked_test = np.concatenate(
    [column.reshape(-1, 1) for column in input_test],
    axis=1,
)
print(stacked_train.shape, stacked_test.shape)
143+
144+
145+
# use linear regression (LR) as the model of the second layer
146+
from sklearn.linear_model import LinearRegression
147+
from sklearn import metrics
148+
149+
# Hold out the last 20% of the stacked training rows to validate the
# second-layer model; the first 80% are used for fitting.
n = int(stacked_train.shape[0] * 0.8)
x_tra, x_val = stacked_train[:n], stacked_train[n:]
y_tra, y_val = y_train[:n], y_train[n:]

# Fit on the first part and report the AUC on the held-out part.
model = LinearRegression()
model.fit(x_tra, y_tra)
y_pred = model.predict(x_val)
print(metrics.roc_auc_score(y_val, y_pred))

# Refit on all stacked training rows, then predict on the stacked test data.
final_model = LinearRegression()
final_model.fit(stacked_train, y_train)
test_prediction = final_model.predict(stacked_test)

0 commit comments

Comments
 (0)