forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbench_tree.py
124 lines (98 loc) · 3.56 KB
/
bench_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""
To run this, you'll need to have the following installed:
* scikit-learn
* numpy
* matplotlib
Runs two benchmarks.
First, we increase the number of samples in the training set, fit a
tree on them and predict on the same samples, and plot the time taken
as a function of the number of samples.
In the second benchmark, we increase the number of dimensions of the
training set, classify a sample and plot the time taken as a function
of the number of dimensions.
"""
import numpy as np
import matplotlib.pyplot as plt
import gc
from datetime import datetime
# Accumulators for the measured run times, in seconds; each benchmark
# function appends one entry per call to its own list.
scikit_classifier_results = list()
scikit_regressor_results = list()
# Number of microseconds in one second, stored as a float (1000000.0).
mu_second = float(10 ** 6)
def bench_scikit_tree_classifier(X, Y):
    """Benchmark with scikit-learn decision tree classifier.

    Times the construction of a default ``DecisionTreeClassifier``, fitting
    it on ``(X, Y)`` and predicting on ``X``, then appends the elapsed time
    in seconds to the module-level ``scikit_classifier_results`` list.

    Parameters
    ----------
    X : array-like
        Samples, passed unchanged to ``fit`` and ``predict``.
    Y : array-like
        Targets, passed unchanged to ``fit``.

    Returns
    -------
    None
        The measurement is recorded as a side effect only.
    """
    # Imports are done before the clock starts so they are never timed.
    from time import perf_counter

    from sklearn.tree import DecisionTreeClassifier

    # Run a collection before timing; presumably so garbage left over from
    # earlier runs is not reclaimed inside the timed region.
    gc.collect()

    # perf_counter() is a monotonic, high-resolution clock. The previous
    # datetime.now() arithmetic used the naive local wall clock (which can
    # jump on DST/NTP adjustments) and ignored the ``days`` field of the
    # resulting timedelta.
    tstart = perf_counter()
    clf = DecisionTreeClassifier()
    clf.fit(X, Y).predict(X)
    elapsed = perf_counter() - tstart

    scikit_classifier_results.append(elapsed)
def bench_scikit_tree_regressor(X, Y):
    """Benchmark with scikit-learn decision tree regressor.

    Times the construction of a default ``DecisionTreeRegressor``, fitting
    it on ``(X, Y)`` and predicting on ``X``, then appends the elapsed time
    in seconds to the module-level ``scikit_regressor_results`` list.

    Parameters
    ----------
    X : array-like
        Samples, passed unchanged to ``fit`` and ``predict``.
    Y : array-like
        Targets, passed unchanged to ``fit``.

    Returns
    -------
    None
        The measurement is recorded as a side effect only.
    """
    # Imports are done before the clock starts so they are never timed.
    from time import perf_counter

    from sklearn.tree import DecisionTreeRegressor

    # Run a collection before timing; presumably so garbage left over from
    # earlier runs is not reclaimed inside the timed region.
    gc.collect()

    # perf_counter() is a monotonic, high-resolution clock. The previous
    # datetime.now() arithmetic used the naive local wall clock (which can
    # jump on DST/NTP adjustments) and ignored the ``days`` field of the
    # resulting timedelta.
    tstart = perf_counter()
    clf = DecisionTreeRegressor()
    clf.fit(X, Y).predict(X)
    elapsed = perf_counter() - tstart

    scikit_regressor_results.append(elapsed)
if __name__ == '__main__':
    # Script entry point: runs both benchmarks and shows the two plots in a
    # single figure (top: varying sample count, bottom: varying dimension).
    print('============================================')
    print('Warning: this is going to take a looong time')
    print('============================================')

    # ---- Benchmark 1: fixed dimension, growing number of samples ----
    n = 10
    step = 10000
    n_samples = 10000
    dim = 10
    n_classes = 10
    # Sample counts actually benchmarked. n_samples is incremented *before*
    # each run, so the first run uses n_samples + step samples; recording
    # the real values keeps the x-axis in sync with the measurements
    # (previously the axis was range(0, n * step, step), which started at 0
    # and was off from every measured size).
    sample_counts = []
    for i in range(n):
        print('============================================')
        print('Entering iteration %s of %s' % (i, n))
        print('============================================')
        n_samples += step
        sample_counts.append(n_samples)
        X = np.random.randn(n_samples, dim)
        # Integer class labels for the classifier ...
        Y = np.random.randint(0, n_classes, (n_samples,))
        bench_scikit_tree_classifier(X, Y)
        # ... and continuous targets for the regressor, on the same X.
        Y = np.random.randn(n_samples)
        bench_scikit_tree_regressor(X, Y)

    plt.figure('scikit-learn tree benchmark results')
    plt.subplot(211)
    plt.title('Learning with varying number of samples')
    plt.plot(sample_counts, scikit_classifier_results, 'g-',
             label='classification')
    plt.plot(sample_counts, scikit_regressor_results, 'r-',
             label='regression')
    plt.legend(loc='upper left')
    plt.xlabel('number of samples')
    plt.ylabel('Time (s)')

    # Start the second benchmark with empty result lists. This block runs at
    # module scope, so rebinding the names here is what the benchmark
    # functions see on their next call.
    scikit_classifier_results = []
    scikit_regressor_results = []

    # ---- Benchmark 2: fixed number of samples (100), growing dimension ----
    n = 10
    step = 500
    start_dim = 500
    n_classes = 10

    dim = start_dim
    # Dimensions actually benchmarked; as above, dim is incremented before
    # each run, so the first run uses start_dim + step features.
    dims = []
    for i in range(0, n):
        print('============================================')
        print('Entering iteration %s of %s' % (i, n))
        print('============================================')
        dim += step
        dims.append(dim)
        X = np.random.randn(100, dim)
        Y = np.random.randint(0, n_classes, (100,))
        bench_scikit_tree_classifier(X, Y)
        Y = np.random.randn(100)
        bench_scikit_tree_regressor(X, Y)

    plt.subplot(212)
    plt.title('Learning in high dimensional spaces')
    plt.plot(dims, scikit_classifier_results, 'g-', label='classification')
    plt.plot(dims, scikit_regressor_results, 'r-', label='regression')
    plt.legend(loc='upper left')
    plt.xlabel('number of dimensions')
    plt.ylabel('Time (s)')
    plt.axis('tight')
    plt.show()