Sklearn kwargs (dmlc#2338)

* Added kwargs support for Sklearn API
* Updated NEWS and CONTRIBUTORS
* Fixed CONTRIBUTORS.md
* Added clarification of **kwargs and test for proper usage
* Fixed lint error
* Fixed more lint errors and clf assigned but never used
* Fixed more lint errors
* Fixed more lint errors
* Fixed issue with changes from different branch bleeding over
* Fixed issue with changes from other branch bleeding over
* Added note that kwargs may not be compatible with Sklearn
* Fixed linting on kwargs note
sagunb · May 24, 2017 · 0f3a404 · 0f3a404
1 parent 6cea1e3
commit 0f3a404
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 4 deletions.
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -65,3 +65,4 @@ List of Contributors
 * [Adam Pocock](https://github.com/Craigacp)
 * [Rory Mitchell](https://github.com/RAMitchell)
   - Rory is the author of the GPU plugin and also contributed the cmake build system and windows continuous integration
+* [Gideon Whitehead](https://github.com/gaw89)
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,9 @@ XGBoost Change Log
 This file records the changes in xgboost library in reverse chronological order.
 
 ## in progress version
+* Updated Sklearn API
+  - Updated to allow use of all XGBoost parameters via **kwargs.
+  - Updated nthread to n_jobs and seed to random_state (as per Sklearn convention).
 * Refactored gbm to allow more friendly cache strategy
   - Specialized some prediction routine
 * Automatically remove nan from input data when it is sparse.

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
@@ -101,6 +101,14 @@ class XGBModel(XGBModelBase):
     missing : float, optional
         Value in the data which needs to be treated as a missing value. If
         None, defaults to np.nan.
+    **kwargs : dict, optional
+        Keyword arguments for XGBoost Booster object.  Full documentation of parameters can
+        be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md.
+        Attempting to set a parameter via the constructor args and **kwargs dict simultaneously
+        will result in a TypeError.
+        Note:
+            **kwargs is unsupported by Sklearn.  We do not guarantee that parameters passed via
+            this argument will interact properly with Sklearn.
 
     Note
     ----
@@ -124,7 +132,7 @@ def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
                  n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0,
                  subsample=1, colsample_bytree=1, colsample_bylevel=1,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
-                 base_score=0.5, random_state=0, seed=None, missing=None):
+                 base_score=0.5, random_state=0, seed=None, missing=None, **kwargs):
         if not SKLEARN_INSTALLED:
             raise XGBoostError('sklearn needs to be installed in order to use this module')
         self.max_depth = max_depth
@@ -133,7 +141,6 @@ def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
         self.silent = silent
         self.objective = objective
         self.booster = booster
-
         self.nthread = nthread
         self.gamma = gamma
         self.min_child_weight = min_child_weight
@@ -146,6 +153,7 @@ def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
         self.scale_pos_weight = scale_pos_weight
         self.base_score = base_score
         self.missing = missing if missing is not None else np.nan
+        self.kwargs = kwargs
         self._Booster = None
         if seed:
             warnings.warn('The seed parameter is deprecated as of version .6.'
@@ -192,6 +200,8 @@ def get_booster(self):
     def get_params(self, deep=False):
         """Get parameters."""
         params = super(XGBModel, self).get_params(deep=deep)
+        if isinstance(self.kwargs, dict):  # if kwargs is a dict, update params accordingly
+            params.update(self.kwargs)
         if params['missing'] is np.nan:
             params['missing'] = None  # sklearn doesn't handle nan. see #4725
         if not params.get('eval_metric', True):
@@ -388,15 +398,15 @@ def __init__(self, max_depth=3, learning_rate=0.1,
                  n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
                  max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
-                 base_score=0.5, random_state=0, seed=None, missing=None):
+                 base_score=0.5, random_state=0, seed=None, missing=None, **kwargs):
         super(XGBClassifier, self).__init__(max_depth, learning_rate,
                                             n_estimators, silent, objective, booster,
                                             n_jobs, nthread, gamma, min_child_weight,
                                             max_delta_step, subsample,
                                             colsample_bytree, colsample_bylevel,
                                             reg_alpha, reg_lambda,
                                             scale_pos_weight, base_score,
-                                            random_state, seed, missing)
+                                            random_state, seed, missing, **kwargs)
 
     def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
             early_stopping_rounds=None, verbose=True):

diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
@@ -3,6 +3,7 @@
 import xgboost as xgb
 import testing as tm
 import warnings
+from nose.tools import raises
 
 rng = np.random.RandomState(1994)
 
@@ -363,3 +364,22 @@ def test_nthread_deprecation():
     with warnings.catch_warnings(record=True) as w:
         xgb.XGBClassifier(nthread=1)
         assert w[0].category == DeprecationWarning
+
+
+def test_kwargs():
+    tm._skip_if_no_sklearn()
+
+    params = {'updater': 'grow_gpu', 'subsample': .5, 'n_jobs': -1}
+    clf = xgb.XGBClassifier(n_estimators=1000, **params)
+    assert clf.get_params()['updater'] == 'grow_gpu'
+    assert clf.get_params()['subsample'] == .5
+    assert clf.get_params()['n_estimators'] == 1000
+
+
+@raises(TypeError)
+def test_kwargs_error():
+    tm._skip_if_no_sklearn()
+
+    params = {'updater': 'grow_gpu', 'subsample': .5, 'n_jobs': -1}
+    clf = xgb.XGBClassifier(n_jobs=1000, **params)
+    assert isinstance(clf, xgb.XGBClassifier)