HEX-1853: fix training_duration_in_ms for GLM2, and tighten up the testing of the time (was checking > 0, which didn't catch the case in which the start time was never set).
rpeck committed Aug 12, 2014
1 parent b1bb866 commit fa7a6bc
Showing 2 changed files with 31 additions and 3 deletions.
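
The test-side change applies one pattern to every model build in create_models: record a wall-clock timestamp just before the build, store the elapsed milliseconds keyed by the model's destination_key, and at the end assert that the server-reported training_duration_in_ms falls below that client-side bound. Below is a minimal sketch of the pattern, using hypothetical helpers (timed_build, check_durations) rather than the test's actual structure:

import time

def timed_build(build_model, model_key, durations):
    # Wrap a model build with a client-side wall-clock bound, in ms.
    before = time.time() * 1000
    result = build_model()                 # e.g. lambda: node.GLM(frame, **params)
    durations[model_key] = time.time() * 1000 - before
    return result

def check_durations(models, durations):
    # The server-reported duration must fit inside the client-side bound.
    # A duration derived from a never-set start time is enormous and fails
    # this check, whereas the old "> 0" assertion let it slip through.
    for key, value in models['models'].iteritems():
        assert value['training_duration_in_ms'] < durations[key], \
            "server duration %s ms exceeds client bound %s ms for %s" % \
            (value['training_duration_in_ms'], durations[key], key)
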
31 changes: 28 additions & 3 deletions py/testdir_multi_jvm/test_model_management.py
@@ -167,10 +167,12 @@ def create_models(self, frame_keys):
node = h2o.nodes[0]

num_models = 0
durations = {}

print "##############################################################"
print "Generating AirlinesTrain GLM2 binary classification model. . ."
# R equivalent: h2o.glm.FV(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, family = "binomial", alpha=0.05, lambda=1.0e-2, standardize=FALSE, nfolds=0)
before = time.time() * 1000
glm_AirlinesTrain_1_params = {
'destination_key': 'glm_AirlinesTrain_binary_1',
'response': 'IsDepDelayed',
@@ -183,13 +185,15 @@ def create_models(self, frame_keys):
'use_all_factor_levels': 1
}
glm_AirlinesTrain_1 = node.GLM(airlines_train_hex, **glm_AirlinesTrain_1_params)
durations['glm_AirlinesTrain_binary_1'] = time.time() * 1000 - before
num_models = num_models + 1
h2o_glm.simpleCheckGLM(self, glm_AirlinesTrain_1, None, **glm_AirlinesTrain_1_params)


print "####################################################################"
print "Generating AirlinesTrain simple GBM binary classification model. . ."
# R equivalent: h2o.gbm(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, n.trees=3, interaction.depth=1, distribution="multinomial", n.minobsinnode=2, shrinkage=.1)
before = time.time() * 1000
gbm_AirlinesTrain_1_params = {
'destination_key': 'gbm_AirlinesTrain_binary_1',
'response': 'IsDepDelayed',
@@ -200,12 +204,14 @@ def create_models(self, frame_keys):
# TODO: what about minobsinnode and shrinkage?!
}
gbm_AirlinesTrain_1 = node.gbm(airlines_train_hex, **gbm_AirlinesTrain_1_params)
durations['gbm_AirlinesTrain_binary_1'] = time.time() * 1000 - before
num_models = num_models + 1


print "#####################################################################"
print "Generating AirlinesTrain complex GBM binary classification model. . ."
# R equivalent: h2o.gbm(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, n.trees=50, interaction.depth=5, distribution="multinomial", n.minobsinnode=2, shrinkage=.1)
before = time.time() * 1000
gbm_AirlinesTrain_2_params = {
'destination_key': 'gbm_AirlinesTrain_binary_2',
'response': 'IsDepDelayed',
@@ -216,12 +222,14 @@ def create_models(self, frame_keys):
# TODO: what about minobsinnode and shrinkage?!
}
gbm_AirlinesTrain_2 = node.gbm(airlines_train_hex, **gbm_AirlinesTrain_2_params)
durations['gbm_AirlinesTrain_binary_2'] = time.time() * 1000 - before
num_models = num_models + 1


print "####################################################################"
print "Generating AirlinesTrain simple DRF binary classification model. . ."
# R equivalent: h2o.randomForest.FV(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, ntree=5, depth=2)
before = time.time() * 1000
rf_AirlinesTrain_1_params = {
'destination_key': 'rf_AirlinesTrain_binary_1',
'response': 'IsDepDelayed',
@@ -231,12 +239,14 @@ def create_models(self, frame_keys):
'classification': 1
}
rf_AirlinesTrain_1 = node.random_forest(airlines_train_hex, **rf_AirlinesTrain_1_params)
durations['rf_AirlinesTrain_binary_1'] = time.time() * 1000 - before
num_models = num_models + 1


print "#####################################################################"
print "Generating AirlinesTrain complex DRF binary classification model. . ."
# R equivalent: h2o.randomForest.FV(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, ntree=50, depth=10)
before = time.time() * 1000
rf_AirlinesTrain_2_params = {
'destination_key': 'rf_AirlinesTrain_binary_2',
'response': 'IsDepDelayed',
@@ -246,12 +256,14 @@ def create_models(self, frame_keys):
'classification': 1
}
rf_AirlinesTrain_2 = node.random_forest(airlines_train_hex, **rf_AirlinesTrain_2_params)
durations['rf_AirlinesTrain_binary_2'] = time.time() * 1000 - before
num_models = num_models + 1


print "#####################################################################"
print "Generating AirlinesTrain complex SpeeDRF binary classification model. . ."
# what is the R binding?
before = time.time() * 1000
speedrf_AirlinesTrain_1_params = {
'destination_key': 'speedrf_AirlinesTrain_binary_1',
'response': 'IsDepDelayed',
@@ -262,12 +274,14 @@ def create_models(self, frame_keys):
}
# Fails to complete in multinode
speedrf_AirlinesTrain_1 = node.speedrf(airlines_train_hex, **speedrf_AirlinesTrain_1_params)
durations['speedrf_AirlinesTrain_binary_1'] = time.time() * 1000 - before
num_models = num_models + 1


print "######################################################################"
print "Generating AirlinesTrain DeepLearning binary classification model. . ."
# R equivalent: h2o.deeplearning(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, classification=TRUE, hidden=c(10, 10))
before = time.time() * 1000
dl_AirlinesTrain_1_params = {
'destination_key': 'dl_AirlinesTrain_binary_1',
'response': 'IsDepDelayed',
@@ -277,12 +291,14 @@ def create_models(self, frame_keys):
'variable_importances': 1
}
dl_AirlinesTrain_1 = node.deep_learning(airlines_train_hex, **dl_AirlinesTrain_1_params)
durations['dl_AirlinesTrain_binary_1'] = time.time() * 1000 - before
num_models = num_models + 1


print "##############################################################################################"
print "Generating AirlinesTrain GLM2 binary classification model with different response column. . ."
# R equivalent: h2o.glm.FV(y = "IsDepDelayed_REC", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, family = "binomial", alpha=0.05, lambda=1.0e-2, standardize=FALSE, nfolds=0)
before = time.time() * 1000
glm_AirlinesTrain_A_params = {
'destination_key': 'glm_AirlinesTrain_binary_A',
'response': 'IsDepDelayed_REC_recoded',
@@ -295,13 +311,15 @@ def create_models(self, frame_keys):
'use_all_factor_levels': 1
}
glm_AirlinesTrain_A = node.GLM(airlines_train_hex, **glm_AirlinesTrain_A_params)
durations['glm_AirlinesTrain_binary_A'] = time.time() * 1000 - before
num_models = num_models + 1
h2o_glm.simpleCheckGLM(self, glm_AirlinesTrain_A, None, **glm_AirlinesTrain_A_params)


print "#########################################################"
print "Generating Prostate GLM2 binary classification model. . ."
# R equivalent: h2o.glm.FV(y = "CAPSULE", x = c("AGE","RACE","PSA","DCAPS"), data = prostate.hex, family = "binomial", nfolds = 0, alpha = 0.5)
before = time.time() * 1000
glm_Prostate_1_params = {
'destination_key': 'glm_Prostate_binary_1',
'response': 'CAPSULE',
@@ -312,13 +330,15 @@ def create_models(self, frame_keys):
'use_all_factor_levels': 0 # should get warning about variable importances!
}
glm_Prostate_1 = node.GLM(prostate_hex, **glm_Prostate_1_params)
durations['glm_Prostate_binary_1'] = time.time() * 1000 - before
num_models = num_models + 1
h2o_glm.simpleCheckGLM(self, glm_Prostate_1, None, **glm_Prostate_1_params)


print "###############################################################"
print "Generating Prostate simple DRF binary classification model. . ."
# R equivalent: h2o.randomForest.FV(y = "CAPSULE", x = c("AGE","RACE","DCAPS"), data = prostate.hex, ntree=10, depth=5)
before = time.time() * 1000
rf_Prostate_1_params = {
'destination_key': 'rf_Prostate_binary_1',
'response': 'CAPSULE',
@@ -328,11 +348,13 @@ def create_models(self, frame_keys):
'classification': 1
}
rf_Prostate_1 = node.random_forest(prostate_hex, **rf_Prostate_1_params)
durations['rf_Prostate_binary_1'] = time.time() * 1000 - before
num_models = num_models + 1


print "#####################################################################"
print "Generating Prostate complex SpeeDRF binary classification model. . ."
before = time.time() * 1000
speedrf_Prostate_1_params = {
'destination_key': 'speedrf_Prostate_binary_1',
'response': 'CAPSULE',
@@ -341,14 +363,15 @@ def create_models(self, frame_keys):
'max_depth': 10,
'classification': 1
}
# TODO: put back; fails to complete in multinode
# speedrf_Prostate_1 = node.speedrf(prostate_hex, **speedrf_Prostate_1_params)
# num_models = num_models + 1
speedrf_Prostate_1 = node.speedrf(prostate_hex, **speedrf_Prostate_1_params)
num_models = num_models + 1
durations['speedrf_Prostate_binary_1'] = time.time() * 1000 - before


print "##############################################"
print "Generating Prostate GLM2 regression model. . ."
# R equivalent: h2o.glm.FV(y = "AGE", x = c("CAPSULE","RACE","PSA","DCAPS"), data = prostate.hex, family = "gaussian", nfolds = 0, alpha = 0.5)
before = time.time() * 1000
glm_Prostate_regression_1_params = {
'destination_key': 'glm_Prostate_regression_1',
'response': 'AGE',
@@ -359,6 +382,7 @@ def create_models(self, frame_keys):
'use_all_factor_levels': 1
}
glm_Prostate_regression_1 = node.GLM(prostate_hex, **glm_Prostate_regression_1_params)
durations['glm_Prostate_regression_1'] = time.time() * 1000 - before
num_models = num_models + 1
h2o_glm.simpleCheckGLM(self, glm_Prostate_regression_1, None, **glm_Prostate_regression_1_params)

@@ -381,6 +405,7 @@ def create_models(self, frame_keys):

for key, value in models['models'].iteritems():
self.assertEquals(value['state'], 'DONE', "Expected state to be DONE for model: " + key)
self.assertTrue(value['training_duration_in_ms'] < durations[key], "Expected training duration as computed by the server (" + str(value['training_duration_in_ms']) + ") to be less than we compute in the test (" + str(durations[key]) + ") for model: " + key)
self.assertNotEqual(found_problem, True, "Missing models on at least one node.")


3 changes: 3 additions & 0 deletions src/main/java/hex/glm/GLM2.java
@@ -794,6 +794,7 @@ public void compute2() {
LogInfo("GLM " + self() + " completed by " + cmp.getClass().getName() + ", " + cmp.toString());
assert _cmp.compareAndSet(null, cmp) : "double completion, first from " + _cmp.get().getClass().getName() + ", second from " + cmp.getClass().getName();
_done = true;
// TODO: move these updates to Model into a DKeyTask so that it runs remotely on the model's home
GLMModel model = DKV.get(dest()).get();
model.maybeComputeVariableImportances();
model.stop_training();
@@ -905,6 +906,7 @@ public void run(boolean doLog, H2OCountedCompleter cmp){
// just fork off the nfolds+1 tasks and wait for the results
assert alpha.length == 1;
start_time = System.currentTimeMillis();

if(nlambdas == -1)nlambdas = 100;
if(lambda_search && nlambdas <= 1)
throw new IllegalArgumentException(LogInfo("GLM2: nlambdas must be > 1 when running with lambda search."));
@@ -945,6 +947,7 @@ public String toString(){
_lastResult = new IterationInfo(0,t,null,t.gradient(0,0));

GLMModel model = new GLMModel(GLM2.this, dest(), _dinfo, _glm, beta_epsilon, alpha[0], lambda_max, _ymu, prior);
model.start_training(start_time);
if(lambda_search) {
assert !Double.isNaN(lambda_max) : LogInfo("running lambda_value search, but don't know what is the lambda_value max!");
model = addLmaxSubmodel(model, t._val);
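
On the GLM2 side, the fix stamps the model with the job's start_time as soon as the GLMModel is constructed (model.start_training(start_time)) and marks it stopped on completion (model.stop_training()), so the reported duration is always computed from a real start timestamp. A rough sketch of the presumed start/stop bookkeeping follows, written in Python for brevity; the actual methods live on the Java Model class and their internals are assumed here:

import time

class TimedModelSketch(object):
    # Assumed semantics of start_training()/stop_training(); illustrative only.
    def __init__(self):
        self.training_start_time = 0
        self.training_duration_in_ms = 0

    def start_training(self, start_time_ms):
        # GLM2 passes its own start_time so the model never reports a
        # duration measured from an unset (zero) start.
        self.training_start_time = start_time_ms

    def stop_training(self):
        self.training_duration_in_ms = time.time() * 1000 - self.training_start_time
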
