Skip to content

Commit

Permalink
MAINT Param validation for SpectralClustering (scikit-learn#23851)
Browse files: browse the repository at this point in the history
Co-authored-by: jeremiedbb <[email protected]>
Co-authored-by: Meekail Zain <[email protected]>
  • Loading branch information
3 people authored Jul 21, 2022
1 parent cb8ec24 commit 46623ef
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 134 deletions.
90 changes: 36 additions & 54 deletions sklearn/cluster/_spectral.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# Andrew Knyazev <[email protected]>
# License: BSD 3 clause

import numbers
from numbers import Integral, Real
import warnings

import numpy as np
Expand All @@ -15,8 +15,9 @@
from scipy.sparse import csc_matrix

from ..base import BaseEstimator, ClusterMixin
from ..utils import check_random_state, as_float_array, check_scalar
from ..metrics.pairwise import pairwise_kernels
from ..utils._param_validation import Interval, StrOptions
from ..utils import check_random_state, as_float_array
from ..metrics.pairwise import pairwise_kernels, KERNEL_PARAMS
from ..neighbors import kneighbors_graph, NearestNeighbors
from ..manifold import spectral_embedding
from ._kmeans import k_means
Expand Down Expand Up @@ -426,8 +427,9 @@ class SpectralClustering(ClusterMixin, BaseEstimator):
but may also lead to instabilities. If None, then ``'arpack'`` is
used. See [4]_ for more details regarding `'lobpcg'`.
n_components : int, default=n_clusters
Number of eigenvectors to use for the spectral embedding.
n_components : int, default=None
Number of eigenvectors to use for the spectral embedding. If None,
defaults to `n_clusters`.
random_state : int, RandomState instance, default=None
A pseudo random number generator used for the initialization
Expand Down Expand Up @@ -615,6 +617,33 @@ class SpectralClustering(ClusterMixin, BaseEstimator):
random_state=0)
"""

# Declarative table of the values accepted for each constructor parameter.
# Each key is a parameter name; each list holds the alternatives allowed for
# it. Presumably consumed by the `self._validate_params()` call in `fit` --
# the validation helper itself is not shown here, confirm against
# `utils._param_validation`.
_parameter_constraints = {
# Integer count, lower bound 1 included (closed="left"), no upper bound.
"n_clusters": [Interval(Integral, 1, None, closed="left")],
# One of the listed solver names, or None.
"eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
# Integer >= 1, or None.
"n_components": [Interval(Integral, 1, None, closed="left"), None],
# String alias rather than a constraint object; its meaning is resolved by
# the validation utilities (not visible in this excerpt).
"random_state": ["random_state"],
# Integer >= 1.
"n_init": [Interval(Integral, 1, None, closed="left")],
# Real number with lower bound 0 and neither bound included, i.e. 0 itself
# is presumably rejected.
"gamma": [Interval(Real, 0, None, closed="neither")],
# Either a callable, or a string naming one of the KERNEL_PARAMS entries or
# one of the three extra affinity modes listed below.
"affinity": [
callable,
StrOptions(
set(KERNEL_PARAMS)
| {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"}
),
],
# Integer >= 1.
"n_neighbors": [Interval(Integral, 1, None, closed="left")],
# Real number >= 0.0, or the literal string "auto".
"eigen_tol": [
Interval(Real, 0.0, None, closed="left"),
StrOptions({"auto"}),
],
# One of the listed label-assignment strategy names.
"assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})],
# Integer >= 1.
"degree": [Interval(Integral, 1, None, closed="left")],
# Any real number: no lower or upper bound is given.
"coef0": [Interval(Real, None, None, closed="neither")],
# A dict, or None.
"kernel_params": [dict, None],
# Any integer, or None; no range is imposed here.
"n_jobs": [Integral, None],
# String alias, resolved by the validation utilities (not visible here).
"verbose": ["verbose"],
}

def __init__(
self,
n_clusters=8,
Expand Down Expand Up @@ -672,6 +701,8 @@ def fit(self, X, y=None):
self : object
A fitted instance of the estimator.
"""
self._validate_params()

X = self._validate_data(
X,
accept_sparse=["csr", "csc", "coo"],
Expand All @@ -690,55 +721,6 @@ def fit(self, X, y=None):
"set ``affinity=precomputed``."
)

check_scalar(
self.n_clusters,
"n_clusters",
target_type=numbers.Integral,
min_val=1,
include_boundaries="left",
)

check_scalar(
self.n_init,
"n_init",
target_type=numbers.Integral,
min_val=1,
include_boundaries="left",
)

check_scalar(
self.gamma,
"gamma",
target_type=numbers.Real,
min_val=1.0,
include_boundaries="left",
)

check_scalar(
self.n_neighbors,
"n_neighbors",
target_type=numbers.Integral,
min_val=1,
include_boundaries="left",
)

if self.eigen_tol != "auto":
check_scalar(
self.eigen_tol,
"eigen_tol",
target_type=numbers.Real,
min_val=0,
include_boundaries="left",
)

check_scalar(
self.degree,
"degree",
target_type=numbers.Integral,
min_val=1,
include_boundaries="left",
)

if self.affinity == "nearest_neighbors":
connectivity = kneighbors_graph(
X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs
Expand Down
79 changes: 0 additions & 79 deletions sklearn/cluster/tests/test_spectral.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from sklearn.cluster import SpectralClustering, spectral_clustering
from sklearn.cluster._spectral import discretize, cluster_qr
from sklearn.feature_extraction import img_to_graph
from sklearn.metrics import pairwise_distances
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
from sklearn.neighbors import NearestNeighbors
Expand Down Expand Up @@ -74,84 +73,6 @@ def test_spectral_clustering(eigen_solver, assign_labels):
assert_array_equal(model_copy.labels_, model.labels_)


def test_spectral_unknown_mode():
    """An unrecognised ``eigen_solver`` must make `spectral_clustering` raise."""
    # Three blob centres on the main diagonal of 3-D space.
    blob_centers = np.array([[value] * 3 for value in (0.0, 10.0, 20.0)])
    samples, _ = make_blobs(
        n_samples=100, centers=blob_centers, cluster_std=1.0, random_state=42
    )

    distances = pairwise_distances(samples)
    # Flip distances into similarities (largest distance maps to 0) and hand
    # them over as a sparse COO matrix.
    similarity = sparse.coo_matrix(np.max(distances) - distances)

    with pytest.raises(ValueError):
        spectral_clustering(
            similarity, n_clusters=2, random_state=0, eigen_solver="<unknown>"
        )


def test_spectral_unknown_assign_labels():
    """An unrecognised ``assign_labels`` must make `spectral_clustering` raise."""
    # Three blob centres on the main diagonal of 3-D space.
    blob_centers = np.array([[value] * 3 for value in (0.0, 10.0, 20.0)])
    samples, _ = make_blobs(
        n_samples=100, centers=blob_centers, cluster_std=1.0, random_state=42
    )

    distances = pairwise_distances(samples)
    # Flip distances into similarities (largest distance maps to 0) and hand
    # them over as a sparse COO matrix.
    similarity = sparse.coo_matrix(np.max(distances) - distances)

    with pytest.raises(ValueError):
        spectral_clustering(
            similarity, n_clusters=2, random_state=0, assign_labels="<unknown>"
        )


@pytest.mark.parametrize(
    "input, params, err_type, err_msg",
    [
        # n_clusters: must be an integer >= 1.
        (X, {"n_clusters": -1}, ValueError, "n_clusters == -1, must be >= 1"),
        (X, {"n_clusters": 0}, ValueError, "n_clusters == 0, must be >= 1"),
        (
            X,
            {"n_clusters": 1.5},
            TypeError,
            "n_clusters must be an instance of int, not float",
        ),
        # n_init: must be an integer >= 1.
        (X, {"n_init": -1}, ValueError, "n_init == -1, must be >= 1"),
        (X, {"n_init": 0}, ValueError, "n_init == 0, must be >= 1"),
        (
            X,
            {"n_init": 1.5},
            TypeError,
            "n_init must be an instance of int, not float",
        ),
        # gamma: values below the accepted minimum.
        (X, {"gamma": -1}, ValueError, "gamma == -1, must be >= 1"),
        (X, {"gamma": 0}, ValueError, "gamma == 0, must be >= 1"),
        # n_neighbors: must be >= 1.
        (X, {"n_neighbors": -1}, ValueError, "n_neighbors == -1, must be >= 1"),
        (X, {"n_neighbors": 0}, ValueError, "n_neighbors == 0, must be >= 1"),
        # eigen_tol: must be non-negative.
        (
            X,
            {"eigen_tol": -1, "eigen_solver": "arpack"},
            ValueError,
            "eigen_tol == -1, must be >= 0",
        ),
        # degree: must be >= 1.
        (X, {"degree": -1}, ValueError, "degree == -1, must be >= 1"),
        (X, {"degree": 0}, ValueError, "degree == 0, must be >= 1"),
    ],
)
def test_spectral_params_validation(input, params, err_type, err_msg):
    """Invalid `SpectralClustering` parameters are rejected when fitting."""
    # Build the estimator outside the `raises` context so that only `fit` is
    # allowed to produce the expected error.
    model = SpectralClustering(**params)
    with pytest.raises(err_type, match=err_msg):
        model.fit(input)


@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering_sparse(assign_labels):
X, y = make_blobs(
Expand Down
1 change: 0 additions & 1 deletion sklearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,6 @@ def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator):
"SimpleImputer",
"SkewedChi2Sampler",
"SpectralBiclustering",
"SpectralClustering",
"SpectralCoclustering",
"SpectralEmbedding",
"SplineTransformer",
Expand Down

0 comments on commit 46623ef

Please sign in to comment.