2 files changed: +43 -0 lines changed

@@ -49,3 +49,7 @@ Scripts
 
 blei_lda.py
     Computes LDA using the AP Corpus.
+wikitopics_create.py
+    Create the topic model for Wikipedia using LDA (must download the Wikipedia database first)
+wikitopics_create_hdp.py
+    Create the topic model for Wikipedia using HDP (must download the Wikipedia database first)
@@ -0,0 +1,39 @@
+# This code is supporting material for the book
+# Building Machine Learning Systems with Python
+# by Willi Richert and Luis Pedro Coelho
+# published by PACKT Publishing
+#
+# It is made available under the MIT License
+
+from __future__ import print_function
+import logging
+import gensim
+import numpy as np
+
+# Set up logging in order to get progress information as the model is being built:
+logging.basicConfig(
+    format='%(asctime)s : %(levelname)s : %(message)s',
+    level=logging.INFO)
+
+# Load the preprocessed corpus (id2word & mm):
+id2word = gensim.corpora.Dictionary.load_from_text(
+    'data/wiki_en_output_wordids.txt.bz2')
+mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')
+
+# Calling the constructor is enough to build the model
+# This call will take a few hours!
+model = gensim.models.hdpmodel.HdpModel(
+    corpus=mm,
+    id2word=id2word,
+    chunksize=10000)
+
+# Save the model so we do not need to learn it again.
+model.save('wiki_hdp.pkl')
+
+# Compute the document/topic matrix
+topics = np.zeros((len(mm), model.num_topics))
+for di, doc in enumerate(mm):
+    doc_top = model[doc]
+    for ti, tv in doc_top:
+        topics[di, ti] += tv
+np.save('topics_hdp.npy', topics)
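
The commit only builds the HDP model and saves its outputs; a quick way to sanity-check the result is to look at the saved document/topic matrix. The following minimal sketch is not part of this commit: it assumes the script above has already produced topics_hdp.npy in the working directory and uses only NumPy to report which topic carries the most total weight and how many topics a typical document uses.

+# Hypothetical follow-up sketch (not part of this commit):
+# inspect the document/topic matrix saved by the script above.
+from __future__ import print_function
+import numpy as np
+
+# Assumes topics_hdp.npy was written by the commit's script.
+topics = np.load('topics_hdp.npy')
+
+# Total weight of each topic across the whole corpus:
+weights = topics.sum(axis=0)
+print('Most heavily used topic index:', weights.argmax())
+
+# Average number of topics with non-zero weight per document
+# (HDP typically concentrates each document on a few topics):
+print('Mean topics per document: {:.1f}'.format(
+    (topics > 0).sum(axis=1).mean()))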