# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
from __future__ import print_function
try:
    import nltk.corpus
except ImportError:
    print("nltk not found")
    print("please install it")
    raise
from scipy.spatial import distance
import numpy as np
from gensim import corpora, models
import sklearn.datasets
import nltk.stem
from collections import defaultdict

english_stemmer = nltk.stem.SnowballStemmer('english')

# Standard English stopwords, plus a few tokens that are frequent in
# newsgroup headers and quoting conventions
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update(['from:', 'subject:', 'writes:', 'writes'])


# Wrap the already-tokenized documents so that gensim can build a
# Dictionary from them and iterate over them as a bag-of-words corpus.
class DirectText(corpora.textcorpus.TextCorpus):

    def get_texts(self):
        return self.input

    def __len__(self):
        return len(self.input)
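
# Note: building the dictionary directly with corpora.Dictionary(texts)
# would be roughly equivalent; wrapping the texts in DirectText lets the
# same object also serve as the bag-of-words corpus passed to LdaModel.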

try:
    dataset = sklearn.datasets.load_mlcomp("20news-18828", "train",
                                           mlcomp_root='./data')
except Exception:
    print("Newsgroup data not found.")
    print("Please download from http://mlcomp.org/datasets/379")
    print("And expand the zip into the subdirectory data/")
    print()
    print()
    raise

# Keep the raw posts around for printing at the end; all further
# processing rebinds `texts`.
otexts = dataset.data
texts = dataset.data
texts = [t.decode('utf-8', 'ignore') for t in texts]
texts = [t.split() for t in texts]
texts = [[w.lower() for w in t] for t in texts]
# Drop tokens containing punctuation or digits, then short words and
# stopwords, and stem what remains.
texts = [[w for w in t if not (set("+-.?!()>@0123456789") & set(w))]
         for t in texts]
texts = [[w for w in t if len(w) > 3 and w not in stopwords]
         for t in texts]
texts = [[english_stemmer.stem(w) for w in t] for t in texts]
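
# Illustration only (hypothetical input): a fragment such as
#   "Subject: Re: computers are fast!"
# survives the steps above as ['computers'] ("subject:" is a stopword,
# "re:" is too short, "fast!" contains punctuation) and is then stemmed
# to ['comput'].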

# Count, for each word, how many documents it appears in, and discard
# words that occur in more than 10% of all posts.
usage = defaultdict(int)
for t in texts:
    for w in set(t):
        usage[w] += 1
limit = len(texts) / 10
too_common = set(w for w in usage if usage[w] > limit)
texts = [[w for w in t if w not in too_common] for t in texts]

corpus = DirectText(texts)
dictionary = corpus.dictionary
# The failing lookup is intentional: indexing the dictionary forces
# gensim to build its id2token mapping, which LdaModel needs below.
try:
    dictionary['computer']
except KeyError:
    pass

model = models.ldamodel.LdaModel(
    corpus, num_topics=100, id2word=dictionary.id2token)
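
# Optional sanity check (commented out; the argument names assume a
# recent gensim release): print the top words of a few topics.
# for topic in model.show_topics(num_topics=5, num_words=8):
#     print(topic)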

# Collect each document's topic weights into a dense matrix of shape
# (n_documents, n_topics); model[c] only reports topics whose weight is
# above gensim's probability cutoff, so the remaining entries stay zero.
thetas = np.zeros((len(texts), 100))
for i, c in enumerate(corpus):
    for ti, v in model[c]:
        thetas[i, ti] += v

# Pairwise distances between topic vectors; the diagonal is set to a
# value larger than any real distance so that a post is never reported
# as its own nearest neighbor.
distances = distance.squareform(distance.pdist(thetas))
large = distances.max() + 1
for i in range(len(distances)):
    distances[i, i] = large

# Print post 1 and the post that is closest to it in topic space
print(otexts[1])
print()
print()
print()
print(otexts[distances[1].argmin()])
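

# A convenience sketch, not part of the original script: the same
# nearest-post lookup for an arbitrary post index. `closest_to` is a
# name introduced here for illustration.
def closest_to(doc_id):
    return otexts[distances[doc_id].argmin()]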