forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bench_20newsgroups.py
97 lines (83 loc) · 3.47 KB
/
bench_20newsgroups.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from __future__ import print_function, division
from time import time
import argparse
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.metrics import accuracy_score
from sklearn.utils.validation import check_array
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
ESTIMATORS = {
"dummy": DummyClassifier(),
"random_forest": RandomForestClassifier(n_estimators=100,
max_features="sqrt",
min_samples_split=10),
"extra_trees": ExtraTreesClassifier(n_estimators=100,
max_features="sqrt",
min_samples_split=10),
"logistic_regression": LogisticRegression(),
"naive_bayes": MultinomialNB(),
"adaboost": AdaBoostClassifier(n_estimators=10),
}
###############################################################################
# Data
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-e', '--estimators', nargs="+", required=True,
choices=ESTIMATORS)
args = vars(parser.parse_args())
data_train = fetch_20newsgroups_vectorized(subset="train")
data_test = fetch_20newsgroups_vectorized(subset="test")
X_train = check_array(data_train.data, dtype=np.float32,
accept_sparse="csc")
X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr")
y_train = data_train.target
y_test = data_test.target
print("20 newsgroups")
print("=============")
print("X_train.shape = {0}".format(X_train.shape))
print("X_train.format = {0}".format(X_train.format))
print("X_train.dtype = {0}".format(X_train.dtype))
print("X_train density = {0}"
"".format(X_train.nnz / np.product(X_train.shape)))
print("y_train {0}".format(y_train.shape))
print("X_test {0}".format(X_test.shape))
print("X_test.format = {0}".format(X_test.format))
print("X_test.dtype = {0}".format(X_test.dtype))
print("y_test {0}".format(y_test.shape))
print()
print("Classifier Training")
print("===================")
accuracy, train_time, test_time = {}, {}, {}
for name in sorted(args["estimators"]):
clf = ESTIMATORS[name]
try:
clf.set_params(random_state=0)
except (TypeError, ValueError):
pass
print("Training %s ... " % name, end="")
t0 = time()
clf.fit(X_train, y_train)
train_time[name] = time() - t0
t0 = time()
y_pred = clf.predict(X_test)
test_time[name] = time() - t0
accuracy[name] = accuracy_score(y_test, y_pred)
print("done")
print()
print("Classification performance:")
print("===========================")
print()
print("%s %s %s %s" % ("Classifier ", "train-time", "test-time",
"Accuracy"))
print("-" * 44)
for name in sorted(accuracy, key=accuracy.get):
print("%s %s %s %s" % (name.ljust(16),
("%.4fs" % train_time[name]).center(10),
("%.4fs" % test_time[name]).center(10),
("%.4f" % accuracy[name]).center(10)))
print()