-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
Copy pathtest_validation.py
132 lines (105 loc) · 4.73 KB
/
test_validation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.model_selection
from scipy import sparse
from autosklearn.data.validation import InputValidator
import pytest
@pytest.mark.parametrize("openmlid", [2, 40975, 40984])
@pytest.mark.parametrize("as_frame", [True, False])
def test_data_validation_for_classification(openmlid, as_frame):
    """Fit and apply an ``InputValidator`` on OpenML classification data.

    The test runs once per combination of the parametrized OpenML dataset
    id and ``as_frame`` flag, which is forwarded to ``fetch_openml``.
    It checks that transforming the training split preserves its shape,
    keeps all-NaN columns, yields a numeric target and leaves a feature
    type on the feature validator.
    """
    x, y = sklearn.datasets.fetch_openml(
        data_id=openmlid, return_X_y=True, as_frame=as_frame
    )
    validator = InputValidator(is_classification=True)

    if as_frame:
        # NaN is not supported in categories, so
        # drop columns with them.
        nan_cols = [i for i in x.columns if x[i].isnull().any()]
        cat_cols = [i for i in x.columns if x[i].dtype.name in ["category", "bool"]]
        unsupported_columns = list(set(nan_cols) & set(cat_cols))
        if unsupported_columns:
            x.drop(unsupported_columns, axis=1, inplace=True)

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        x, y, test_size=0.33, random_state=0
    )

    validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

    X_train_t, y_train_t = validator.transform(X_train, y_train)
    assert np.shape(X_train) == np.shape(X_train_t)

    # Leave columns that are complete NaN
    # The sklearn pipeline will handle that
    if as_frame and np.any(pd.isnull(X_train).values.all(axis=0)):
        assert np.any(pd.isnull(X_train_t).values.all(axis=0))
    elif not as_frame and np.any(pd.isnull(X_train).all(axis=0)):
        assert np.any(pd.isnull(X_train_t).all(axis=0))

    # make sure everything was encoded to number
    assert np.issubdtype(y_train_t.dtype, np.number)

    # Make sure we created a feat type.
    # This must be an ``assert``: a bare comparison is evaluated and
    # discarded, so the check would never be able to fail.
    assert validator.feature_validator.feat_type is not None
@pytest.mark.parametrize("openmlid", [505, 546, 531])
@pytest.mark.parametrize("as_frame", [True, False])
def test_data_validation_for_regression(openmlid, as_frame):
    """Fit and apply an ``InputValidator`` on OpenML regression data.

    The test runs once per combination of the parametrized OpenML dataset
    id and ``as_frame`` flag, which is forwarded to ``fetch_openml``.
    It checks that transforming the training split preserves its shape,
    keeps all-NaN columns and leaves a feature type on the feature
    validator.
    """
    x, y = sklearn.datasets.fetch_openml(
        data_id=openmlid, return_X_y=True, as_frame=as_frame
    )
    validator = InputValidator(is_classification=False)

    if as_frame:
        # NaN is not supported in categories, so
        # drop columns with them.
        nan_cols = [i for i in x.columns if x[i].isnull().any()]
        cat_cols = [i for i in x.columns if x[i].dtype.name in ["category", "bool"]]
        unsupported_columns = list(set(nan_cols) & set(cat_cols))
        if unsupported_columns:
            x.drop(unsupported_columns, axis=1, inplace=True)

    # Only the training part of the split is used by this test; the
    # held-out part is discarded.
    X_train, _, y_train, _ = sklearn.model_selection.train_test_split(
        x, y, test_size=0.33, random_state=0
    )

    validator.fit(X_train=X_train, y_train=y_train)

    X_train_t, y_train_t = validator.transform(X_train, y_train)
    assert np.shape(X_train) == np.shape(X_train_t)

    # Leave columns that are complete NaN
    # The sklearn pipeline will handle that
    if as_frame and np.any(pd.isnull(X_train).values.all(axis=0)):
        assert np.any(pd.isnull(X_train_t).values.all(axis=0))
    elif not as_frame and np.any(pd.isnull(X_train).all(axis=0)):
        assert np.any(pd.isnull(X_train_t).all(axis=0))

    # Make sure we created a feat type.
    # This must be an ``assert``: a bare comparison is evaluated and
    # discarded, so the check would never be able to fail.
    assert validator.feature_validator.feat_type is not None
def test_sparse_data_validation_for_regression():
    """Exercise a regression ``InputValidator`` that was fitted on sparse data.

    The validator is fitted on a COO matrix and then used to transform the
    dense array that matrix was built from, followed by a CSR copy of it.
    """
    dense_features, targets = sklearn.datasets.make_regression(
        n_samples=100, n_features=50, random_state=0
    )
    coo_features = sparse.coo_matrix(dense_features)

    validator = InputValidator(is_classification=False)
    validator.fit(X_train=coo_features, y_train=targets)

    features_t, targets_t = validator.transform(dense_features, targets)
    assert np.shape(dense_features) == np.shape(features_t)

    # Both transformed outputs must carry a numeric dtype.
    for transformed in (features_t, targets_t):
        assert np.issubdtype(transformed.dtype, np.number)

    # Transforming a different sparse format than the one used in fit
    # must work as well (no assertion: the call just has to succeed).
    csr_features = sparse.csr_matrix(dense_features)
    features_t, targets_t = validator.transform(csr_features, targets)
def test_validation_unsupported():
    """Check the errors raised when the validator is given unusable input.

    Three situations are covered: a train pair whose feature and target
    lengths differ, a test pair whose lengths differ, and a ``transform``
    call issued after both ``fit`` attempts above have raised.
    """

    def two_sample_features():
        # Built fresh on every call so no array is shared between checks.
        return np.array([[0, 1, 0], [0, 1, 1]])

    def two_sample_targets():
        return np.array([0, 1])

    def six_sample_targets():
        # Deliberately longer than the two-row feature matrix.
        return np.array([0, 1, 0, 0, 0, 0])

    train_mismatch = r"Inconsistent number of train datapoints.*"
    test_mismatch = r"Inconsistent number of test datapoints.*"
    not_fitted = r"Cannot call transform on a validator .*fitted"

    validator = InputValidator()

    with pytest.raises(ValueError, match=train_mismatch):
        validator.fit(
            X_train=two_sample_features(),
            y_train=six_sample_targets(),
        )

    with pytest.raises(ValueError, match=test_mismatch):
        validator.fit(
            X_train=two_sample_features(),
            y_train=two_sample_targets(),
            X_test=two_sample_features(),
            y_test=six_sample_targets(),
        )

    with pytest.raises(ValueError, match=not_fitted):
        validator.transform(
            X=two_sample_features(),
            y=two_sample_targets(),
        )