forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocument_clustering.py
110 lines (83 loc) · 3.13 KB
/
document_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
=======================================
Clustering text documents using k-means
=======================================
This is an example showing how the scikit-learn can be used to cluster
documents by topics using a bag-of-words approach. This example uses
a scipy.sparse matrix to store the features instead of standard numpy arrays.
Two algorithms are demoed: ordinary k-means and its faster cousin minibatch
k-means.
"""
# Author: Peter Prettenhofer <[email protected]>
# Lars Buitinck <[email protected]>
# License: Simplified BSD
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import Vectorizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)s %(message)s')
# parse commandline arguments
op = OptionParser()
op.add_option("--no-minibatch",
action="store_false", dest="minibatch", default=True,
help="Use ordinary k-means algorithm.")
print __doc__
op.print_help()
(opts, args) = op.parse_args()
if len(args) > 0:
op.error("this script takes no arguments.")
sys.exit(1)
###############################################################################
# Load some categories from the training set
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
]
# Uncomment the following to do the analysis on all the categories
#categories = None
print "Loading 20 newsgroups dataset for categories:"
print categories
dataset = fetch_20newsgroups(subset='all', categories=categories,
shuffle=True, random_state=42)
print "%d documents" % len(dataset.data)
print "%d categories" % len(dataset.target_names)
print
labels = dataset.target
true_k = np.unique(labels).shape[0]
print "Extracting features from the training dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer(max_df=0.95, max_features=10000)
X = vectorizer.fit_transform(dataset.data)
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print
###############################################################################
# Do the actual clustering
if opts.minibatch:
km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
init_size=1000,
batch_size=1000, verbose=1)
else:
km = KMeans(k=true_k, init='random', max_iter=100, n_init=1, verbose=1)
print "Clustering sparse data with %s" % km
t0 = time()
km.fit(X)
print "done in %0.3fs" % (time() - t0)
print
print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)
print "Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)
print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
print "Adjusted Rand-Index: %.3f" % \
metrics.adjusted_rand_score(labels, km.labels_)
print "Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
X, labels, sample_size=1000)
print