forked from h2oai/h2o-3
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbeating_the_benchmark.R
168 lines (133 loc) · 6.27 KB
/
beating_the_benchmark.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
######################################################################
## Beating the Benchmark with H2O - LB Score: 0.50103
## R script by Arno Candel @ArnoCandel
## https://www.kaggle.com/users/234686/arno-candel
## More information at http://h2o.ai/
## Source code: http://github.com/h2oai/h2o-dev/
######################################################################
######################################################################
## Step 1 - Download and Install H2O
######################################################################
# Start from a clean slate: detach h2o if it is currently attached, then
# uninstall any copy that is already in the library.
if ("package:h2o" %in% search()) {
  detach("package:h2o", unload = TRUE)
}
if ("h2o" %in% rownames(installed.packages())) {
  remove.packages("h2o")
}

# Install whichever of the packages H2O depends on are not present yet.
# The installed-package list is re-read for every entry, so each check sees
# the effect of any installation that happened just before it.
h2o_deps <- c("methods", "statmod", "stats", "graphics",
              "RCurl", "jsonlite", "tools", "utils")
for (dep in h2o_deps) {
  if (!(dep %in% rownames(installed.packages()))) {
    install.packages(dep)
  }
}

# Download and install the H2O package for R from the pinned release
# repository (source package).
install.packages(
  "h2o",
  type = "source",
  repos = c("http://h2o-release.s3.amazonaws.com/h2o-dev/master/1179/R")
)
######################################################################
## Step 2 - Launch H2O
######################################################################
## Attach the h2o R package
library(h2o)

## Launch H2O on localhost; nthreads = -1 asks for all cores
h2oServer <- h2o.init(nthreads = -1)

## Directory where the Kaggle data lives. The trailing slash is kept because
## file names are appended to this value with paste0() further down.
home_dir <- path.expand("~")
dir <- paste0(home_dir, "/h2o-kaggle/otto/")

## For Spark/Hadoop/YARN/Standalone operation on a cluster, follow the
## instructions on http://h2o.ai/download/ and then connect to any cluster
## node from R by using these two lines instead of the ones above:
# h2oServer <- h2o.init(ip = "mr-0xd1", port = 53322)
# dir <- "hdfs://mr-0xd6/users/arno/h2o-kaggle/otto/"
######################################################################
## Step 3 - Import Data and create Train/Validation Splits
######################################################################
## Import the train and test CSV files from `dir` under fixed frame ids
train_path <- paste0(dir, "train.csv")
test_path <- paste0(dir, "test.csv")
train.hex <- h2o.importFile(train_path, destination_frame = "train.hex")
test.hex <- h2o.importFile(test_path, destination_frame = "test.hex")

## Quick look at the training frame
dim(train.hex)
summary(train.hex)

## Column layout: the first column is 'id' (ignored), the last column is the
## response, and every column in between is a predictor
n_cols <- ncol(train.hex)
response <- n_cols
predictors <- 2:(n_cols - 1)

## Split into 80/20 Train/Validation: rows with rnd below 0.8 go to the
## training holdout, all remaining rows go to the validation holdout
rnd <- h2o.runif(train.hex, 1234)
train_holdout.hex <- h2o.assign(train.hex[rnd < 0.8, ], "train_holdout.hex")
valid_holdout.hex <- h2o.assign(train.hex[rnd >= 0.8, ], "valid_holdout.hex")
######################################################################
## Step 4 - Use H2O Flow to inspect the data and build some models on
## train_holdout.hex/valid_holdout.hex to get a feeling for the problem
######################################################################
## Connect browser to http://localhost:54321 (or http://cluster-node-ip:port)
######################################################################
## Step 5 - GBM Hyper-Parameter Tuning with Random Search
######################################################################
## Random search over GBM hyper-parameters: every iteration draws one random
## configuration, trains it on train_holdout.hex and validates it on
## valid_holdout.hex. No RNG seed is set in this block, so the configurations
## drawn depend on the state of R's random number generator.
n_models <- 10

## Preallocate the result list instead of growing it with c() on each pass
models <- vector("list", n_models)

for (i in seq_len(n_models)) {
  ## The four draws happen in a fixed order: trees, depth, rows, learn rate
  rand_numtrees <- sample(1:50, 1)              ## 1 to 50 trees
  rand_max_depth <- sample(5:15, 1)             ## 5 to 15 max depth
  rand_min_rows <- sample(1:10, 1)              ## 1 to 10 min rows
  rand_learn_rate <- 0.025 * sample(1:10, 1)    ## 0.025 to 0.25 learning rate

  ## Encode the drawn configuration in the model id so each model can be
  ## identified by name
  model_name <- paste0("GBMModel_", i,
                       "_ntrees", rand_numtrees,
                       "_maxdepth", rand_max_depth,
                       "_minrows", rand_min_rows,
                       "_learnrate", rand_learn_rate)

  model <- h2o.gbm(
    x = predictors,
    y = response,
    training_frame = train_holdout.hex,
    validation_frame = valid_holdout.hex,
    model_id = model_name,
    distribution = "multinomial",
    ntrees = rand_numtrees,
    max_depth = rand_max_depth,
    min_rows = rand_min_rows,
    learn_rate = rand_learn_rate
  )

  models[[i]] <- model
}
## Find the best model (lowest logloss on the validation holdout set).
## Each candidate in `models` is scored on valid_holdout.hex; the result is one
## logloss value per model, in the same order as `models`.
valid_logloss <- vapply(
  models,
  function(candidate) {
    h2o.logloss(h2o.performance(candidate, valid_holdout.hex))
  },
  numeric(1)
)

## which.min() skips NA/NaN entries and returns the first minimum, so ties are
## resolved in favour of the earliest model. It returns integer(0) when there
## is nothing to choose from (no models, or no non-missing logloss).
best_idx <- which.min(valid_logloss)
if (length(best_idx) == 0L) {
  stop("No model produced a usable validation logloss", call. = FALSE)
}

## Results consumed by the rest of the script
best_err <- valid_logloss[[best_idx]]
best_model <- models[[best_idx]]
## Show the "winning" hyper-parameters, one value per line
parms <- best_model@allparameters
parms[["ntrees"]]
parms[["max_depth"]]
parms[["min_rows"]]
parms[["learn_rate"]]

## Score the winning model on both holdout frames
train_perf <- h2o.performance(best_model, train_holdout.hex)
valid_perf <- h2o.performance(best_model, valid_holdout.hex)

## Training set performance metrics: confusion matrix, then logloss
h2o.confusionMatrix(train_perf)
h2o.logloss(train_perf)

## Validation set performance metrics: confusion matrix, then logloss
h2o.confusionMatrix(valid_perf)
h2o.logloss(valid_perf)
######################################################################
## Step 6 - Build Final Model using the Full Training Data
######################################################################
## Hyper-parameters for the final model. These are fixed values and are not
## read from best_model.
## NOTE(review): presumably copied from an earlier random-search run --
## confirm they are still the configuration you want to submit.
final_ntrees <- 42
final_max_depth <- 10
final_min_rows <- 10
final_learn_rate <- 0.175

## Train on the full train.hex frame; no validation frame is passed here
model <- h2o.gbm(
  x = predictors,
  y = response,
  training_frame = train.hex,
  model_id = "final_model",
  distribution = "multinomial",
  ntrees = final_ntrees,
  max_depth = final_max_depth,
  min_rows = final_min_rows,
  learn_rate = final_learn_rate
)
######################################################################
## Step 7 - Make Final Test Set Predictions for Submission
######################################################################
## Score the test set with the final model.
## Predictions: label + 9 per-class probabilities
pred <- predict(model, test.hex)
head(pred)

## Drop the first column (the label) so only the probabilities remain
pred <- pred[, -1]
head(pred)

## Put the ids (first column of the test set) in front of the probabilities
submission <- h2o.cbind(test.hex[, 1], pred)
head(submission)

## Save the submission to disk, in the same directory as the input data
submission_path <- paste0(dir, "submission.csv")
h2o.exportFile(submission, submission_path)