feat: change api of ic and cv (#85)
bbayukari authored Apr 18, 2024
1 parent 3169cde commit 4742f48
Showing 10 changed files with 275 additions and 264 deletions.
64 changes: 26 additions & 38 deletions docs/source/feature/DataScienceTool.rst
@@ -71,16 +71,6 @@ In other places, we presume the sparsity level would be appropriately set. However

Note that when using a list for ``sparsity``, the ``sample_size`` parameter must also be provided to the solver in ``skscope``.


.. code-block:: python

    from skscope import ScopeSolver

    solver = ScopeSolver(
        dimensionality=p,          ## there are p parameters
        sparsity=[1, 2, 3, 4, 5],  ## the candidate support sizes
        sample_size=n,             ## the number of samples
    )
There are two ways to evaluate sparsity levels: `Information Criterion`_ and `Cross Validation`_.


@@ -90,19 +80,42 @@ Information Criterion

An information criterion is a statistical measure used to assess the goodness of fit of a model while penalizing model complexity. It helps in selecting the optimal model from a set of competing models. In the context of sparsity-constrained optimization, information criteria can be used to evaluate different sparsity levels and identify the most suitable support size.
.. There is another way to evaluate sparsity levels, which is information criterion. The smaller the information criterion, the better the model.
There are four types of information criteria that can be used in ``skscope``: the Akaike information criterion (AIC, `[1]`_), the Bayesian information criterion (BIC, `[2]`_), the extended BIC (EBIC, `[3]`_), and the special information criterion (SIC, `[4]`_).
There are four types of information criteria that can be implemented via ``skscope.utilities``: the Akaike information criterion (AIC, `[1]`_), the Bayesian information criterion (BIC, `[2]`_), the extended BIC (EBIC, `[3]`_), and the special information criterion (SIC, `[4]`_).
.. If ``sparsity`` is a list and ``cv=None``, the solver will use information criteria to evaluate the sparsity level.
The input parameter ``ic`` in the solvers of skscope can be used to choose the information criterion. The default value is ``ic='sic'``. Here is an example using SIC to find the optimal support size.
The input parameter ``ic_method`` in the solvers of skscope can be used to choose the information criterion. It should be a function that computes the information criterion, taking the same parameters as in this example:

.. code-block:: python

    import numpy as np

    def SIC(
        objective_value: float,
        dimensionality: int,
        effective_params_num: int,
        train_size: int,
    ):
        return 2 * objective_value + effective_params_num * np.log(np.log(train_size)) * np.log(dimensionality)
Here is an example using SIC to find the optimal support size.

.. code-block:: python

    import jax.numpy as jnp
    import numpy as np
    from skscope import ScopeSolver
    from skscope.utilities import LinearSIC

    n, p = 100, 10
    X, y = np.random.randn(n, p), np.random.randn(n)
    solver = ScopeSolver(
        dimensionality=p,
        sparsity=[1, 2, 3, 4, 5],  ## we want to select 1-5 variables
        sample_size=n,             ## the number of samples
        ic_method=LinearSIC,       ## use SIC to evaluate sparsity levels
    )
    solver.solve(
        lambda params: jnp.sum(jnp.square(X @ params - y)),
        jit=True,
    )
    print(solver.get_result())
Please note that the effectiveness of an information criterion depends heavily on the implementation of the objective function. Before use, carefully check that the objective function and the information criterion implementation match.
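Because ``ic_method`` only needs to follow the signature above, a custom criterion can be supplied directly. Below is a minimal sketch, assuming the objective function returns the negative log-likelihood (up to an additive constant); ``my_bic`` is a hypothetical user-defined function, not a ``skscope`` built-in.

.. code-block:: python

    import numpy as np
    from skscope import ScopeSolver

    def my_bic(
        objective_value: float,
        dimensionality: int,
        effective_params_num: int,
        train_size: int,
    ):
        ## classic BIC, assuming objective_value is the negative log-likelihood
        return 2 * objective_value + effective_params_num * np.log(train_size)

    n, p = 100, 10
    solver = ScopeSolver(
        dimensionality=p,
        sparsity=[1, 2, 3, 4, 5],
        sample_size=n,
        ic_method=my_bic,  ## any callable with the documented signature works
    )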

Cross Validation
^^^^^^^^^^^^^^^^^^^^
@@ -149,31 +162,6 @@ To utilize cross validation `[5]`_, there are some requirements:
params = solver.solve(custom_objective, data=(X, y))
There is a simpler way to use cross validation: let the objective function take an additional parameter ``index``. In this case, we do not need to set ``split_method``. Below is an illustrative example of this usage.

.. code-block:: python

    import jax.numpy as jnp
    from sklearn.datasets import make_regression
    from skscope import ScopeSolver

    ## generate data
    n, p, k = 10, 5, 3
    X, y, true_params = make_regression(n_samples=n, n_features=p, n_informative=k, coef=True)

    def custom_objective(params, index):
        return jnp.sum(
            jnp.square(y[index] - X[index, :] @ params)
        )

    solver = ScopeSolver(
        dimensionality=p,          ## there are p parameters
        sparsity=[1, 2, 3, 4, 5],  ## we want to select 1-5 variables
        sample_size=n,             ## the number of samples
        cv=10,                     ## use 10-fold cross validation
    )
    params = solver.solve(custom_objective)
Reference
------------------------------
15 changes: 7 additions & 8 deletions pytest/create_test_model.py
@@ -22,24 +22,23 @@ def create_linear_model(self):
)

def linear_model(params):
return jnp.sum(jnp.square(Y - jnp.matmul(X, params))) / 2
return jnp.sum(jnp.square(Y - jnp.matmul(X, params)))

def linear_model_numpy(params):
return np.sum(np.square(Y - np.matmul(X, params))) / 2
return np.sum(np.square(Y - np.matmul(X, params)))

def grad_linear_model(params):
return -np.matmul(X.T, (Y - np.matmul(X, params)))
return -np.matmul(X.T, (Y - np.matmul(X, params))) * 2

def hess_linear_model(params):
return np.matmul(X.T, X)
return np.matmul(X.T, X) * 2

X_jnp = jnp.array(X)
Y_jnp = jnp.array(Y)

def linear_model_data(params, data_indices):
return jnp.sum(
jnp.square(Y_jnp[data_indices] - X_jnp[data_indices,] @ params)
)
def linear_model_data(params, data):
x, y = data
return jnp.sum(jnp.square(y - x @ params))

return {
"n_samples": self.N,
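As a side note, the factor-of-two changes above can be sanity-checked against automatic differentiation. A short illustrative sketch (random toy data, not part of the test suite):

.. code-block:: python

    import jax
    import jax.numpy as jnp
    import numpy as np

    X = np.random.randn(5, 3)
    Y = np.random.randn(5)
    params = jnp.zeros(3)

    ## the loss without the former 1/2 factor
    loss = lambda b: jnp.sum(jnp.square(Y - X @ b))

    ## autodiff agrees with the analytic gradient -2 * X.T @ (Y - X @ params)
    auto_grad = jax.grad(loss)(params)
    manual_grad = -2 * X.T @ (Y - X @ params)
    assert np.allclose(auto_grad, manual_grad, atol=1e-5)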
24 changes: 17 additions & 7 deletions pytest/test_args.py
@@ -8,7 +8,14 @@
import pytest

from create_test_model import CreateTestModel
from skscope import ScopeSolver, BaseSolver, HTPSolver, GraspSolver, IHTSolver
from skscope import (
utilities,
ScopeSolver,
BaseSolver,
HTPSolver,
GraspSolver,
IHTSolver,
)
import skscope._scope as _scope

model_creator = CreateTestModel()
@@ -53,12 +60,12 @@ def test_init_params(model, solver_creator):

@pytest.mark.parametrize("model", models, ids=models_ids)
@pytest.mark.parametrize("solver_creator", solvers, ids=solvers_ids)
@pytest.mark.parametrize("ic_type", ["aic", "bic", "sic", "ebic"])
def test_ic(model, solver_creator, ic_type):
def test_ic(model, solver_creator):
solver = solver_creator(
model["n_features"],
[0, model["n_informative"]],
sample_size=model["n_samples"],
ic_type=ic_type,
ic_method=utilities.LinearSIC,
)
solver.solve(model["loss"], jit=True)

@@ -73,8 +80,9 @@ def test_cv_random_split(model, solver_creator):
[0, model["n_informative"]],
model["n_samples"],
cv=2,
split_method=lambda data, indices: (data[0][indices], data[1][indices]),
)
solver.solve(model["loss_data"], jit=True)
solver.solve(model["loss_data"], data=model["data"], jit=True)

assert set(model["support_set"]) == set(solver.support_set)

@@ -92,8 +100,9 @@ def test_cv_given_split(model, solver_creator):
model["n_samples"],
cv=n_fold,
cv_fold_id=cv_fold_id,
split_method=lambda data, indices: (data[0][indices], data[1][indices]),
)
solver.solve(model["loss_data"], jit=True)
solver.solve(model["loss_data"], data=model["data"], jit=True)

assert set(model["support_set"]) == set(solver.support_set)

@@ -190,6 +199,7 @@ def test_scope_args():
path_type="gs",
sample_size=linear["n_samples"],
cv=2,
split_method=lambda data, indices: (data[0][indices], data[1][indices]),
file_log_level="error",
)
solver.solve(linear["loss_data"])
solver.solve(linear["loss_data"], data=linear["data"])
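Taken together, the cross-validation API exercised by these tests looks roughly like the following minimal sketch (toy data and sizes are illustrative; this is an assumed usage pattern, not code from the commit):

.. code-block:: python

    import jax.numpy as jnp
    import numpy as np
    from skscope import ScopeSolver

    n, p = 100, 10
    X, y = np.random.randn(n, p), np.random.randn(n)

    def loss_data(params, data):
        x, y = data
        return jnp.sum(jnp.square(y - x @ params))

    solver = ScopeSolver(
        dimensionality=p,
        sparsity=[1, 2, 3],
        sample_size=n,
        cv=2,  ## 2-fold cross validation
        ## split_method tells the solver how to slice `data` for each fold
        split_method=lambda data, indices: (data[0][indices], data[1][indices]),
    )
    params = solver.solve(loss_data, data=(X, y), jit=True)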
35 changes: 20 additions & 15 deletions pytest/test_except.py
@@ -9,7 +9,7 @@
import re

from create_test_model import CreateTestModel
from skscope import ScopeSolver, BaseSolver
from skscope import utilities, ScopeSolver, BaseSolver
import skscope._scope as _scope

model_creator = CreateTestModel()
@@ -75,52 +75,57 @@ def test_sparsity(model, solver_creator):
@pytest.mark.parametrize("model", models, ids=models_ids)
@pytest.mark.parametrize("solver_creator", solvers, ids=solvers_ids)
def test_ic(model, solver_creator):
solver = solver_creator(
model["n_features"], model["n_informative"], model["n_samples"]
)
solver.set_config(ic_type="ic")
solver = solver_creator(model["n_features"])
with pytest.raises(
ValueError,
match=re.escape(
"ic_method should be provided for choosing sparsity level with information criterion."
),
):
solver.solve(model["loss"], jit=True)
solver.set_config(ic_method=utilities.LinearSIC)
with pytest.raises(
ValueError,
match=re.escape("ic_type should be one of ['aic', 'bic', 'sic','ebic']."),
match=re.escape("sample_size should be given when using ic_method."),
):
solver.solve(model["loss"], jit=True)


@pytest.mark.parametrize("model", models, ids=models_ids)
@pytest.mark.parametrize("solver_creator", solvers, ids=solvers_ids)
def test_cv(model, solver_creator):
solver = solver_creator(
model["n_features"], model["n_informative"], model["n_samples"]
)
solver.set_config(cv=1 + model["n_samples"])
solver = solver_creator(model["n_features"], cv=2)
with pytest.raises(
ValueError, match=re.escape("cv should not be greater than sample_size.")
):
solver.solve(model["loss"], jit=True)
solver.set_config(cv=model["n_samples"])
solver.set_config(sample_size=model["n_samples"])
with pytest.raises(
ValueError, match=re.escape("split_method should be provided when cv > 1.")
):
solver.solve(model["loss"], data=(), jit=True)
solver.solve(model["loss"], jit=True)
solver.set_config(
split_method=lambda data, indices: (data[0][indices], data[1][indices])
)
solver.set_config(cv_fold_id=np.zeros((1, model["n_samples"])))
with pytest.raises(
ValueError, match=re.escape("cv_fold_id should be an 1D array of integers.")
):
solver.solve(model["loss"], jit=True)
solver.solve(model["loss_data"], data=model["data"], jit=True)
solver.set_config(cv_fold_id=np.zeros(1 + model["n_samples"]))
with pytest.raises(
ValueError,
match=re.escape("The length of cv_fold_id should be equal to sample_size."),
):
solver.solve(model["loss"], jit=True)
solver.solve(model["loss_data"], data=model["data"], jit=True)
solver.set_config(cv_fold_id=np.zeros(model["n_samples"]))
with pytest.raises(
ValueError,
match=re.escape(
"The number of different elements in cv_fold_id should be equal to cv."
),
):
solver.solve(model["loss"], jit=True)
solver.solve(model["loss_data"], data=model["data"], jit=True)


@pytest.mark.parametrize("model", models, ids=models_ids)
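From the checks above, a valid ``cv_fold_id`` configuration would look roughly like this sketch (sizes illustrative; an assumed usage pattern, not code from the commit):

.. code-block:: python

    import numpy as np
    from skscope import ScopeSolver

    n, p = 10, 5
    ## cv_fold_id must be a 1D integer array whose length equals sample_size
    ## and whose number of distinct values equals cv
    cv_fold_id = np.repeat([0, 1], n // 2)

    solver = ScopeSolver(
        dimensionality=p,
        sparsity=[1, 2],
        sample_size=n,
        cv=2,
        cv_fold_id=cv_fold_id,
        split_method=lambda data, indices: (data[0][indices], data[1][indices]),
    )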
2 changes: 1 addition & 1 deletion skscope/__init__.py
@@ -4,7 +4,7 @@
# Copyright (C) 2023 abess-team
# Licensed under the MIT License.

__version__ = "0.1.6"
__version__ = "0.1.7"
__author__ = "Zezhi Wang, Jin Zhu," "Peng Chen," "Junxian Zhu, Xueqin Wang"

