# tree_binning_example.py
#
# Example script: bins a test sample with TreeBinningSklearn and logs the
# resulting bin histograms.
import logging
import numpy as np
from sklearn.datasets import make_regression
from disteval.discretization import TreeBinningSklearn
if __name__ == '__main__':
    # Send warnings issued via the `warnings` module through logging, then
    # configure the root logger that all messages below are written to.
    logging.captureWarnings(True)
    logging.basicConfig(
        level=logging.INFO,
        format='%(processName)-10s %(name)s %(levelname)-8s %(message)s')

    # Synthetic regression data: the first half is used for fitting, the
    # second half is the "test" sample that gets binned.
    n_samples = 10000
    features, target = make_regression(n_samples=n_samples)
    split = len(target) // 2
    train_features = features[:split, :]
    train_target = target[:split]
    test_features = features[split:, :]

    # Keyword arguments common to both binnings built below.
    shared_settings = dict(
        regression=True,
        min_samples_leaf=100,
        random_state=1337)

    # 1) Binning with the number of leaves capped at 10.
    logging.info('Binning with at least 100 samples per bin and not more '
                 'than 10 leafs.')
    binning = TreeBinningSklearn(max_leaf_nodes=10, **shared_settings)
    binning.fit(train_features, train_target)
    test_bins = binning.digitize(test_features)
    logging.info('Histogram of the binned test sample:')
    logging.info(np.bincount(test_bins))

    # 2) Same settings, but `max_leaf_nodes` is deliberately left unset so
    #    the number of leaves is not limited by this script.
    logging.info('Binning with at least 100 samples per bin and no limit on '
                 'the number of leafs.')
    binning = TreeBinningSklearn(**shared_settings)
    binning.fit(train_features, train_target)
    test_bins = binning.digitize(test_features)
    logging.info('Histogram of the binned test sample:')
    logging.info(np.bincount(test_bins))

    # 3) Prune the second binning using the test sample with a threshold of
    #    100, then bin the test sample again with the pruned tree.
    logging.info('Using the test sample to prune the tree and ensure at '
                 'least 100 events in each bin:')
    binning.prune(test_features, 100)
    test_bins = binning.digitize(test_features)
    logging.info('Histogram of the pruned binning:')
    logging.info(np.bincount(test_bins))