MAINT Param validation for SpectralClustering (scikit-learn#23851)

Co-authored-by: jeremiedbb <[email protected]>
Co-authored-by: Meekail Zain <[email protected]>
maxi-marufo · Jul 21, 2022 · 46623ef
1 parent cb8ec24
commit 46623ef
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 134 deletions.
diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py
@@ -6,7 +6,7 @@
 #         Andrew Knyazev <[email protected]>
 # License: BSD 3 clause
 
-import numbers
+from numbers import Integral, Real
 import warnings
 
 import numpy as np
@@ -15,8 +15,9 @@
 from scipy.sparse import csc_matrix
 
 from ..base import BaseEstimator, ClusterMixin
-from ..utils import check_random_state, as_float_array, check_scalar
-from ..metrics.pairwise import pairwise_kernels
+from ..utils._param_validation import Interval, StrOptions
+from ..utils import check_random_state, as_float_array
+from ..metrics.pairwise import pairwise_kernels, KERNEL_PARAMS
 from ..neighbors import kneighbors_graph, NearestNeighbors
 from ..manifold import spectral_embedding
 from ._kmeans import k_means
@@ -426,8 +427,9 @@ class SpectralClustering(ClusterMixin, BaseEstimator):
         but may also lead to instabilities. If None, then ``'arpack'`` is
         used. See [4]_ for more details regarding `'lobpcg'`.
 
-    n_components : int, default=n_clusters
-        Number of eigenvectors to use for the spectral embedding.
+    n_components : int, default=None
+        Number of eigenvectors to use for the spectral embedding. If None,
+        defaults to `n_clusters`.
 
     random_state : int, RandomState instance, default=None
         A pseudo random number generator used for the initialization
@@ -615,6 +617,33 @@ class SpectralClustering(ClusterMixin, BaseEstimator):
         random_state=0)
     """
 
+    _parameter_constraints = {
+        "n_clusters": [Interval(Integral, 1, None, closed="left")],
+        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
+        "n_components": [Interval(Integral, 1, None, closed="left"), None],
+        "random_state": ["random_state"],
+        "n_init": [Interval(Integral, 1, None, closed="left")],
+        "gamma": [Interval(Real, 0, None, closed="neither")],
+        "affinity": [
+            callable,
+            StrOptions(
+                set(KERNEL_PARAMS)
+                | {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"}
+            ),
+        ],
+        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
+        "eigen_tol": [
+            Interval(Real, 0.0, None, closed="left"),
+            StrOptions({"auto"}),
+        ],
+        "assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})],
+        "degree": [Interval(Integral, 1, None, closed="left")],
+        "coef0": [Interval(Real, None, None, closed="neither")],
+        "kernel_params": [dict, None],
+        "n_jobs": [Integral, None],
+        "verbose": ["verbose"],
+    }
+
     def __init__(
         self,
         n_clusters=8,
@@ -672,6 +701,8 @@ def fit(self, X, y=None):
         self : object
             A fitted instance of the estimator.
         """
+        self._validate_params()
+
         X = self._validate_data(
             X,
             accept_sparse=["csr", "csc", "coo"],
@@ -690,55 +721,6 @@ def fit(self, X, y=None):
                 "set ``affinity=precomputed``."
             )
 
-        check_scalar(
-            self.n_clusters,
-            "n_clusters",
-            target_type=numbers.Integral,
-            min_val=1,
-            include_boundaries="left",
-        )
-
-        check_scalar(
-            self.n_init,
-            "n_init",
-            target_type=numbers.Integral,
-            min_val=1,
-            include_boundaries="left",
-        )
-
-        check_scalar(
-            self.gamma,
-            "gamma",
-            target_type=numbers.Real,
-            min_val=1.0,
-            include_boundaries="left",
-        )
-
-        check_scalar(
-            self.n_neighbors,
-            "n_neighbors",
-            target_type=numbers.Integral,
-            min_val=1,
-            include_boundaries="left",
-        )
-
-        if self.eigen_tol != "auto":
-            check_scalar(
-                self.eigen_tol,
-                "eigen_tol",
-                target_type=numbers.Real,
-                min_val=0,
-                include_boundaries="left",
-            )
-
-        check_scalar(
-            self.degree,
-            "degree",
-            target_type=numbers.Integral,
-            min_val=1,
-            include_boundaries="left",
-        )
-
         if self.affinity == "nearest_neighbors":
             connectivity = kneighbors_graph(
                 X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs

diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py
@@ -15,7 +15,6 @@
 from sklearn.cluster import SpectralClustering, spectral_clustering
 from sklearn.cluster._spectral import discretize, cluster_qr
 from sklearn.feature_extraction import img_to_graph
-from sklearn.metrics import pairwise_distances
 from sklearn.metrics import adjusted_rand_score
 from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
 from sklearn.neighbors import NearestNeighbors
@@ -74,84 +73,6 @@ def test_spectral_clustering(eigen_solver, assign_labels):
         assert_array_equal(model_copy.labels_, model.labels_)
 
 
-def test_spectral_unknown_mode():
-    # Test that SpectralClustering fails with an unknown mode set.
-    centers = np.array(
-        [
-            [0.0, 0.0, 0.0],
-            [10.0, 10.0, 10.0],
-            [20.0, 20.0, 20.0],
-        ]
-    )
-    X, true_labels = make_blobs(
-        n_samples=100, centers=centers, cluster_std=1.0, random_state=42
-    )
-    D = pairwise_distances(X)  # Distance matrix
-    S = np.max(D) - D  # Similarity matrix
-    S = sparse.coo_matrix(S)
-    with pytest.raises(ValueError):
-        spectral_clustering(S, n_clusters=2, random_state=0, eigen_solver="<unknown>")
-
-
-def test_spectral_unknown_assign_labels():
-    # Test that SpectralClustering fails with an unknown assign_labels set.
-    centers = np.array(
-        [
-            [0.0, 0.0, 0.0],
-            [10.0, 10.0, 10.0],
-            [20.0, 20.0, 20.0],
-        ]
-    )
-    X, true_labels = make_blobs(
-        n_samples=100, centers=centers, cluster_std=1.0, random_state=42
-    )
-    D = pairwise_distances(X)  # Distance matrix
-    S = np.max(D) - D  # Similarity matrix
-    S = sparse.coo_matrix(S)
-    with pytest.raises(ValueError):
-        spectral_clustering(S, n_clusters=2, random_state=0, assign_labels="<unknown>")
-
-
-@pytest.mark.parametrize(
-    "input, params, err_type, err_msg",
-    [
-        (X, {"n_clusters": -1}, ValueError, "n_clusters == -1, must be >= 1"),
-        (X, {"n_clusters": 0}, ValueError, "n_clusters == 0, must be >= 1"),
-        (
-            X,
-            {"n_clusters": 1.5},
-            TypeError,
-            "n_clusters must be an instance of int, not float",
-        ),
-        (X, {"n_init": -1}, ValueError, "n_init == -1, must be >= 1"),
-        (X, {"n_init": 0}, ValueError, "n_init == 0, must be >= 1"),
-        (
-            X,
-            {"n_init": 1.5},
-            TypeError,
-            "n_init must be an instance of int, not float",
-        ),
-        (X, {"gamma": -1}, ValueError, "gamma == -1, must be >= 1"),
-        (X, {"gamma": 0}, ValueError, "gamma == 0, must be >= 1"),
-        (X, {"n_neighbors": -1}, ValueError, "n_neighbors == -1, must be >= 1"),
-        (X, {"n_neighbors": 0}, ValueError, "n_neighbors == 0, must be >= 1"),
-        (
-            X,
-            {"eigen_tol": -1, "eigen_solver": "arpack"},
-            ValueError,
-            "eigen_tol == -1, must be >= 0",
-        ),
-        (X, {"degree": -1}, ValueError, "degree == -1, must be >= 1"),
-        (X, {"degree": 0}, ValueError, "degree == 0, must be >= 1"),
-    ],
-)
-def test_spectral_params_validation(input, params, err_type, err_msg):
-    """Check the parameters validation in `SpectralClustering`."""
-    est = SpectralClustering(**params)
-    with pytest.raises(err_type, match=err_msg):
-        est.fit(input)
-
-
 @pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
 def test_spectral_clustering_sparse(assign_labels):
     X, y = make_blobs(

diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
@@ -526,7 +526,6 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator):
     "SimpleImputer",
     "SkewedChi2Sampler",
     "SpectralBiclustering",
-    "SpectralClustering",
     "SpectralCoclustering",
     "SpectralEmbedding",
     "SplineTransformer",