Skip to content

Commit

Permalink
Add n-fold cross-validation tests for all the model builders.
Browse files Browse the repository at this point in the history
  • Loading branch information
rpeck committed Aug 18, 2014
1 parent d5c38ca commit 963d5c7
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 25 deletions.
25 changes: 4 additions & 21 deletions py/h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -1773,6 +1773,7 @@ def speedrf(self, data_key, ntrees=50, max_depth=10, timeoutSecs=300, retryDelay
'select_stat_type': 'ENTROPY',
'importance':0,
'strata_samples': None,
'n_folds': None
}
check_params_update_kwargs(params_dict, kwargs, 'SpeeDRF', print_params)

Expand Down Expand Up @@ -1824,6 +1825,7 @@ def random_forest(self, data_key, trees=None,
'score_each_iteration': None,
'seed': None,
'validation': None,
'n_folds': None
}
if 'model_key' in kwargs:
kwargs['destination_key'] = kwargs['model_key'] # hmm..should we switch test to new param?
Expand Down Expand Up @@ -2179,6 +2181,7 @@ def gbm(self, data_key, timeoutSecs=600, retryDelaySecs=1, initialDelaySecs=5, p
'classification': None,
'score_each_iteration': None,
'grid_parallelism': None,
'n_folds': None,
}

# only lets these params thru
Expand Down Expand Up @@ -2396,6 +2399,7 @@ def deep_learning(self, data_key, timeoutSecs=60, retryDelaySecs=1, initialDelay
'replicate_training_data': None,
'single_node_mode': None,
'shuffle_training_data': None,
'n_folds': None,
}
# only lets these params thru
check_params_update_kwargs(params_dict, kwargs, 'deep_learning', print_params)
Expand Down Expand Up @@ -2557,27 +2561,6 @@ def GLM(self, key,
time.sleep(5)
return a

def GLMGrid(self, key,
            timeoutSecs=300, retryDelaySecs=1.0, initialDelaySecs=None, pollTimeoutSecs=180,
            noise=None, benchmarkLogging=None, noPoll=False, **kwargs):
    """Start a GLM grid job via GLM_shared and, unless noPoll is set, poll it to completion.

    Args:
        key: passed straight through to GLM_shared as its first argument.
        timeoutSecs, retryDelaySecs, initialDelaySecs: forwarded both to
            GLM_shared and to poll_url.
        pollTimeoutSecs, noise, benchmarkLogging: forwarded to poll_url only.
        noPoll: when true, return the GLM_shared response immediately
            without polling.
        **kwargs: forwarded unchanged to GLM_shared. The optional
            'browseAlso' entry is also read here to decide whether to open
            the result in a browser.

    Returns:
        The GLM_shared response when noPoll is true, otherwise whatever
        poll_url returns for that response.
    """
    a = self.GLM_shared(key, timeoutSecs, retryDelaySecs, initialDelaySecs, parentName="GLMGrid", **kwargs)

    if noPoll:
        return a

    a = self.poll_url(a, timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs,
                      initialDelaySecs=initialDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
                      noise=noise, benchmarkLogging=benchmarkLogging)
    verboseprint("GLMGrid done:", dump_json(a))

    browseAlso = kwargs.get('browseAlso', False)
    # Logical `or`, not bitwise `|`: a caller passing browseAlso=None (or any
    # non-integer value) must not raise TypeError here.
    if browseAlso or browse_json:
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement and also parses as a Python 3 call.
        print("Viewing the GLM grid result through the browser")
        h2b.browseJsonHistoryAsUrlLastMatch('GLMGridProgress')
        # NOTE(review): presumably gives the browser time to load the page -- confirm.
        time.sleep(5)
    return a

def GLMGrid_view(self, timeoutSecs=300, print_params=False, **kwargs):
params_dict = {
'job': None,
Expand Down
135 changes: 131 additions & 4 deletions py/testdir_multi_jvm/test_model_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,52 @@ def create_models(self, frame_keys):
h2o_glm.simpleCheckGLM(self, glm_AirlinesTrain_1, None, **glm_AirlinesTrain_1_params)


print "#########################################################################################"
print "Generating AirlinesTrain GLM2 binary classification model with nfold crossvalidation. . ."
# R equivalent: h2o.glm.FV(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, family = "binomial", alpha=0.5, lambda=1.0e-2, standardize=FALSE, nfolds=3)
before = time.time() * 1000
glm_AirlinesTrain_3fold_params = {
'destination_key': 'glm_AirlinesTrain_binary_3fold',
'response': 'IsDepDelayed',
'ignored_cols': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
'family': 'binomial',
'alpha': 0.5,
'standardize': 0,
'lambda': 1.0e-2,
'n_folds': 3,
'use_all_factor_levels': 1
}
glm_AirlinesTrain_3fold = node.GLM(airlines_train_hex, **glm_AirlinesTrain_3fold_params)
durations['glm_AirlinesTrain_binary_3fold'] = time.time() * 1000 - before
num_models = num_models + 1 # TODO: interesting that the xval models aren't visible as they are in GBM
h2o_glm.simpleCheckGLM(self, glm_AirlinesTrain_3fold, None, **glm_AirlinesTrain_3fold_params)




# print "##############################################################"
# print "Grid search: Generating AirlinesTrain GLM2 binary classification models. . ."
# before = time.time() * 1000
# glm_AirlinesTrain_grid_params = {
# 'destination_key': 'glm_AirlinesTrain_binary_grid_',
# 'response': 'IsDepDelayed',
# 'ignored_cols': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
# 'family': 'binomial',
# 'alpha': '0.5, 1.0',
# 'standardize': 0,
# 'lambda': '1.0e-2,1.0e-3,1.0e-4',
# 'n_folds': 2,
# 'use_all_factor_levels': 1
# }
# glm_AirlinesTrain_grid = node.GLMGrid(airlines_train_hex, **glm_AirlinesTrain_grid_params)
# durations['glm_AirlinesTrain_binary_grid'] = time.time() * 1000 - before
# num_models = num_models + 6
# h2o_glm.simpleCheckGLMGrid(self, glm_AirlinesTrain_grid, None, **glm_AirlinesTrain_grid_params)





print "####################################################################"
print "Generating AirlinesTrain simple GBM binary classification model. . ."
# R equivalent: h2o.gbm(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, n.trees=3, interaction.depth=1, distribution="multinomial", n.minobsinnode=2, shrinkage=.1)
Expand All @@ -200,7 +246,8 @@ def create_models(self, frame_keys):
'ignored_cols_by_name': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
'ntrees': 3,
'max_depth': 1,
'classification': 1
'classification': 1,
'n_folds': 0
# TODO: what about minobsinnode and shrinkage?!
}
gbm_AirlinesTrain_1 = node.gbm(airlines_train_hex, **gbm_AirlinesTrain_1_params)
Expand All @@ -218,14 +265,34 @@ def create_models(self, frame_keys):
'ignored_cols_by_name': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
'ntrees': 50,
'max_depth': 5,
'classification': 1
'classification': 1,
'n_folds': 0
# TODO: what about minobsinnode and shrinkage?!
}
gbm_AirlinesTrain_2 = node.gbm(airlines_train_hex, **gbm_AirlinesTrain_2_params)
durations['gbm_AirlinesTrain_binary_2'] = time.time() * 1000 - before
num_models = num_models + 1


print "###############################################################################################"
print "Generating AirlinesTrain simple GBM binary classification model with nfold crossvalidation. . ."
# R equivalent: h2o.gbm(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, n.trees=3, interaction.depth=1, distribution="multinomial", n.minobsinnode=2, shrinkage=.1)
before = time.time() * 1000
gbm_AirlinesTrain_3fold_params = {
'destination_key': 'gbm_AirlinesTrain_binary_3fold',
'response': 'IsDepDelayed',
'ignored_cols_by_name': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
'ntrees': 3,
'max_depth': 1,
'classification': 1,
'n_folds': 3
# TODO: what about minobsinnode and shrinkage?!
}
gbm_AirlinesTrain_3fold = node.gbm(airlines_train_hex, **gbm_AirlinesTrain_3fold_params)
durations['gbm_AirlinesTrain_binary_3fold'] = time.time() * 1000 - before
num_models = num_models + 4 # 1 main model and 3 xval models


print "####################################################################"
print "Generating AirlinesTrain simple DRF binary classification model. . ."
# R equivalent: h2o.randomForest.FV(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, ntree=5, depth=2)
Expand Down Expand Up @@ -260,6 +327,24 @@ def create_models(self, frame_keys):
num_models = num_models + 1


print "###############################################################################################"
print "Generating AirlinesTrain simple DRF binary classification model with nfold crossvalidation. . ."
# R equivalent: h2o.randomForest.FV(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, ntree=5, depth=2)
before = time.time() * 1000
rf_AirlinesTrain_3fold_params = {
'destination_key': 'rf_AirlinesTrain_binary_3fold',
'response': 'IsDepDelayed',
'ignored_cols_by_name': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
'ntrees': 5,
'max_depth': 2,
'classification': 1,
'n_folds': 3
}
rf_AirlinesTrain_3fold = node.random_forest(airlines_train_hex, **rf_AirlinesTrain_3fold_params)
durations['rf_AirlinesTrain_binary_3fold'] = time.time() * 1000 - before
num_models = num_models + 4


print "#####################################################################"
print "Generating AirlinesTrain complex SpeeDRF binary classification model. . ."
# what is the R binding?
Expand All @@ -273,12 +358,30 @@ def create_models(self, frame_keys):
'classification': 1,
'importance': 1
}
# Fails to complete in multinode
speedrf_AirlinesTrain_1 = node.speedrf(airlines_train_hex, **speedrf_AirlinesTrain_1_params)
durations['speedrf_AirlinesTrain_binary_1'] = time.time() * 1000 - before
num_models = num_models + 1


print "####################################################################################################"
print "Generating AirlinesTrain complex SpeeDRF binary classification model with nfold crossvalidation. . ."
# what is the R binding?
before = time.time() * 1000
speedrf_AirlinesTrain_3fold_params = {
'destination_key': 'speedrf_AirlinesTrain_binary_3fold',
'response': 'IsDepDelayed',
'ignored_cols_by_name': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
'ntrees': 50,
'max_depth': 10,
'classification': 1,
'importance': 1,
'n_folds': 3
}
speedrf_AirlinesTrain_3fold = node.speedrf(airlines_train_hex, **speedrf_AirlinesTrain_3fold_params)
durations['speedrf_AirlinesTrain_binary_3fold'] = time.time() * 1000 - before
num_models = num_models + 4 # 1 main model and 3 xval models


print "######################################################################"
print "Generating AirlinesTrain DeepLearning binary classification model. . ."
# R equivalent: h2o.deeplearning(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, classification=TRUE, hidden=c(10, 10))
Expand Down Expand Up @@ -317,6 +420,24 @@ def create_models(self, frame_keys):
h2o_glm.simpleCheckGLM(self, glm_AirlinesTrain_A, None, **glm_AirlinesTrain_A_params)


print "#################################################################################################"
print "Generating AirlinesTrain DeepLearning binary classification model with nfold crossvalidation. . ."
# R equivalent: h2o.deeplearning(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, classification=TRUE, hidden=c(10, 10), nfolds=3)
before = time.time() * 1000
dl_AirlinesTrain_3fold_params = {
'destination_key': 'dl_AirlinesTrain_binary_3fold',
'response': 'IsDepDelayed',
'ignored_cols': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
'hidden': [10, 10],
'classification': 1,
'variable_importances': 1,
'n_folds': 3
}
dl_AirlinesTrain_3fold = node.deep_learning(airlines_train_hex, **dl_AirlinesTrain_3fold_params)
durations['dl_AirlinesTrain_binary_3fold'] = time.time() * 1000 - before
num_models = num_models + 4 # 1 main model and 3 xval models


print "##############################################################################################"
print "Generating AirlinesTrain Naive Bayes binary classification model. . ."
# R equivalent: h2o.naive_bayes(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, family = "binomial", alpha=0.05, lambda=1.0e-2, standardize=FALSE, nfolds=0)
Expand Down Expand Up @@ -421,7 +542,13 @@ def create_models(self, frame_keys):

for key, value in models['models'].iteritems():
self.assertEquals(value['state'], 'DONE', "Expected state to be DONE for model: " + key)
self.assertTrue(value['training_duration_in_ms'] < durations[key], "Expected training duration as computed by the server (" + str(value['training_duration_in_ms']) + ") to be less than we compute in the test (" + str(durations[key]) + ") for model: " + key)
idx = key.find('_xval')
# For cross-validation models, compare against the duration measured for the parent model, since each xval model should take less time than that
if -1 == idx:
expected = durations[key]
else:
expected = durations[key[0:idx]]
self.assertTrue(value['training_duration_in_ms'] < expected, "Expected training duration as computed by the server (" + str(value['training_duration_in_ms']) + ") to be less than we compute in the test (" + str(expected) + ") for model: " + key)
self.assertNotEqual(found_problem, True, "Missing models on at least one node.")


Expand Down

0 comments on commit 963d5c7

Please sign in to comment.