Skip to content

Commit 9d523ef

Browse files
Merge pull request h2oai#16001 from h2oai/sy/-MS
h2oaiGH-15915: Craft MS Examples
2 parents 1639968 + 6a76d36 commit 9d523ef

File tree

2 files changed

+203
-0
lines changed

2 files changed

+203
-0
lines changed

h2o-bindings/bin/custom/python/gen_modelselection.py

+101
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,23 @@ def coef_norm(self, predictor_size=None):
4242
4343
:param predictor_size: predictor subset size, will only return model coefficients of that subset size.
4444
:return: list of Python Dicts of coefficients for all models built with different predictor numbers
45+
46+
:examples:
47+
48+
>>> import h2o
49+
>>> from h2o.estimators import H2OModelSelectionEstimator
50+
>>> h2o.init()
51+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
52+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
53+
>>> response = "GLEASON"
54+
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
55+
... seed=12345,
56+
... mode="maxr")
57+
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
58+
>>> coeff_norm = maxrModel.coef_norm()
59+
>>> print(coeff_norm)
60+
>>> coeff_norm_3 = maxrModel.coef_norm(predictor_size=3) # print coefficient norm with 3 predictors
61+
>>> print(coeff_norm_3)
4562
"""
4663
model_ids = self._model_json["output"]["best_model_ids"]
4764
if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep":
@@ -95,6 +112,23 @@ def coef(self, predictor_size=None):
95112
96113
:param predictor_size: predictor subset size, will only return model coefficients of that subset size.
97114
:return: list of Python Dicts of coefficients for all models built with different predictor numbers
115+
116+
:examples:
117+
118+
>>> import h2o
119+
>>> from h2o.estimators import H2OModelSelectionEstimator
120+
>>> h2o.init()
121+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
122+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
123+
>>> response = "GLEASON"
124+
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
125+
... seed=12345,
126+
... mode="maxr")
127+
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
128+
>>> coeff = maxrModel.coef()
129+
>>> print(coeff)
130+
>>> coeff_3 = maxrModel.coef(predictor_size=3)
131+
>>> print(coeff_3)
98132
"""
99133
if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep":
100134
coef_names = self._model_json["output"]["coefficient_names"]
@@ -148,6 +182,7 @@ def coef(self, predictor_size=None):
148182
def result(self):
149183
"""
150184
Get the result frame that contains information about the model building process, as provided for modelselection and anovaglm.
185+
151186
:return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm.
152187
"""
153188
return H2OFrame._expr(expr=ExprNode("result", ASTId(self.key)))._frame(fill_cache=True)
@@ -225,3 +260,69 @@ def get_best_model_predictors(self):
225260
mode=maxr, the model returned is no longer guaranteed to have the best R2 value.
226261
"""
227262
)
263+
264+
examples = dict(
265+
build_glm_model="""
266+
>>> import h2o
267+
>>> from h2o.estimators import H2OModelSelectionEstimator
268+
>>> h2o.init()
269+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
270+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
271+
>>> response = "GLEASON"
272+
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
273+
... seed=12345,
274+
... mode="maxrsweep",
275+
... build_glm_model=True)
276+
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
277+
>>> result = maxrModel.result()
278+
>>> # get the GLM model with the best performance for a fixed predictor size:
279+
>>> one_model = h2o.get_model(result["model_id"][1, 0])
280+
>>> predict = one_model.predict(prostate)
281+
>>> # print a version of the predict frame:
282+
>>> print(predict)
283+
""",
284+
influence="""
285+
>>> import h2o
286+
>>> from h2o.estimators import H2OModelSelectionEstimator
287+
>>> h2o.init()
288+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
289+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
290+
>>> response = "GLEASON"
291+
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
292+
... seed=12345,
293+
... mode="maxr",
294+
... influence="dfbetas")
295+
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
296+
>>> glm_rid = maxrModel.get_regression_influence_diagnostics()
297+
>>> print(glm_rid)
298+
""",
299+
p_values_threshold="""
300+
>>> import h2o
301+
>>> from h2o.estimators import H2OModelSelectionEstimator
302+
>>> h2o.init()
303+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
304+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
305+
>>> response = "GLEASON"
306+
>>> backwardModel = H2OModelSelectionEstimator(min_predictor_number=2,
307+
... seed=12345,
308+
... mode="backward",
309+
... p_values_threshold=0.001)
310+
>>> backwardModel.train(x=predictors, y=response, training_frame=prostate)
311+
>>> result = backwardModel.result()
312+
>>> print(result)
313+
""",
314+
mode="""
315+
>>> import h2o
316+
>>> from h2o.estimators import H2OModelSelectionEstimator
317+
>>> h2o.init()
318+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
319+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
320+
>>> response = "GLEASON"
321+
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
322+
... seed=12345,
323+
... mode="maxr")
324+
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
325+
>>> results = maxrModel.result()
326+
>>> print(results)
327+
"""
328+
)

h2o-py/h2o/estimators/model_selection.py

+102
Original file line numberDiff line numberDiff line change
@@ -1190,6 +1190,21 @@ def mode(self):
11901190
than 'maxr', 'backward' for backward selection.
11911191
11921192
Type: ``Literal["allsubsets", "maxr", "maxrsweep", "backward"]``, defaults to ``"maxr"``.
1193+
1194+
:examples:
1195+
1196+
>>> import h2o
1197+
>>> from h2o.estimators import H2OModelSelectionEstimator
1198+
>>> h2o.init()
1199+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
1200+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
1201+
>>> response = "GLEASON"
1202+
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
1203+
... seed=12345,
1204+
... mode="maxr")
1205+
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
1206+
>>> results = maxrModel.result()
1207+
>>> print(results)
11931208
"""
11941209
return self._parms.get("mode")
11951210

@@ -1207,6 +1222,26 @@ def build_glm_model(self):
12071222
themselves. Defaults to false.
12081223
12091224
Type: ``bool``, defaults to ``False``.
1225+
1226+
:examples:
1227+
1228+
>>> import h2o
1229+
>>> from h2o.estimators import H2OModelSelectionEstimator
1230+
>>> h2o.init()
1231+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
1232+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
1233+
>>> response = "GLEASON"
1234+
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
1235+
... seed=12345,
1236+
... mode="maxrsweep",
1237+
... build_glm_model=True)
1238+
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
1239+
>>> result = maxrModel.result()
1240+
>>> # get the GLM model with the best performance for a fixed predictor size:
1241+
>>> one_model = h2o.get_model(result["model_id"][1, 0])
1242+
>>> predict = one_model.predict(prostate)
1243+
>>> # print a version of the predict frame:
1244+
>>> print(predict)
12101245
"""
12111246
return self._parms.get("build_glm_model")
12121247

@@ -1222,6 +1257,22 @@ def p_values_threshold(self):
12221257
below this threshold
12231258
12241259
Type: ``float``, defaults to ``0.0``.
1260+
1261+
:examples:
1262+
1263+
>>> import h2o
1264+
>>> from h2o.estimators import H2OModelSelectionEstimator
1265+
>>> h2o.init()
1266+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
1267+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
1268+
>>> response = "GLEASON"
1269+
>>> backwardModel = H2OModelSelectionEstimator(min_predictor_number=2,
1270+
... seed=12345,
1271+
... mode="backward",
1272+
... p_values_threshold=0.001)
1273+
>>> backwardModel.train(x=predictors, y=response, training_frame=prostate)
1274+
>>> result = backwardModel.result()
1275+
>>> print(result)
12251276
"""
12261277
return self._parms.get("p_values_threshold")
12271278

@@ -1236,6 +1287,22 @@ def influence(self):
12361287
If set to dfbetas will calculate the difference in beta when a datarow is included and excluded in the dataset.
12371288
12381289
Type: ``Literal["dfbetas"]``.
1290+
1291+
:examples:
1292+
1293+
>>> import h2o
1294+
>>> from h2o.estimators import H2OModelSelectionEstimator
1295+
>>> h2o.init()
1296+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
1297+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
1298+
>>> response = "GLEASON"
1299+
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
1300+
... seed=12345,
1301+
... mode="maxr",
1302+
... influence="dfbetas")
1303+
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
1304+
>>> glm_rid = maxrModel.get_regression_influence_diagnostics()
1305+
>>> print(glm_rid)
12391306
"""
12401307
return self._parms.get("influence")
12411308

@@ -1303,6 +1370,23 @@ def coef_norm(self, predictor_size=None):
13031370
13041371
:param predictor_size: predictor subset size, will only return model coefficients of that subset size.
13051372
:return: list of Python Dicts of coefficients for all models built with different predictor numbers
1373+
1374+
:examples:
1375+
1376+
>>> import h2o
1377+
>>> from h2o.estimators import H2OModelSelectionEstimator
1378+
>>> h2o.init()
1379+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
1380+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
1381+
>>> response = "GLEASON"
1382+
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
1383+
... seed=12345,
1384+
... mode="maxr")
1385+
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
1386+
>>> coeff_norm = maxrModel.coef_norm()
1387+
>>> print(coeff_norm)
1388+
>>> coeff_norm_3 = maxrModel.coef_norm(predictor_size=3) # print coefficient norm with 3 predictors
1389+
>>> print(coeff_norm_3)
13061390
"""
13071391
model_ids = self._model_json["output"]["best_model_ids"]
13081392
if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep":
@@ -1356,6 +1440,23 @@ def coef(self, predictor_size=None):
13561440
13571441
:param predictor_size: predictor subset size, will only return model coefficients of that subset size.
13581442
:return: list of Python Dicts of coefficients for all models built with different predictor numbers
1443+
1444+
:examples:
1445+
1446+
>>> import h2o
1447+
>>> from h2o.estimators import H2OModelSelectionEstimator
1448+
>>> h2o.init()
1449+
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
1450+
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
1451+
>>> response = "GLEASON"
1452+
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
1453+
... seed=12345,
1454+
... mode="maxr")
1455+
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
1456+
>>> coeff = maxrModel.coef()
1457+
>>> print(coeff)
1458+
>>> coeff_3 = maxrModel.coef(predictor_size=3)
1459+
>>> print(coeff_3)
13591460
"""
13601461
if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep":
13611462
coef_names = self._model_json["output"]["coefficient_names"]
@@ -1409,6 +1510,7 @@ def coef(self, predictor_size=None):
14091510
def result(self):
14101511
"""
14111512
Get the result frame that contains information about the model building process, as provided for modelselection and anovaglm.
1513+
14121514
:return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm.
14131515
"""
14141516
return H2OFrame._expr(expr=ExprNode("result", ASTId(self.key)))._frame(fill_cache=True)

0 commit comments

Comments
 (0)