Commit 24ac39d

ENH Add script to process Wikipedia with HDP
1 parent 3ae4c9d commit 24ac39d

2 files changed: +43 -0

ch04/README.rst (+4)
@@ -49,3 +49,7 @@ Scripts
 
 blei_lda.py
     Computes LDA using the AP Corpus.
+wikitopics_create.py
+    Creates the topic model for Wikipedia using LDA (must download the Wikipedia database first)
+wikitopics_create_hdp.py
+    Creates the topic model for Wikipedia using HDP (must download the Wikipedia database first)
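
Both new scripts assume the Wikipedia dump has already been downloaded and
preprocessed into gensim's bag-of-words/TF-IDF format. A minimal sketch of
that step in Python, mirroring what gensim's bundled make_wiki script does
(the dump filename is an assumption; any recent dump from
dumps.wikimedia.org works, and this step takes several hours):

# Preprocessing sketch (not part of this commit): builds the id2word and
# TF-IDF files that the wikitopics scripts load.
# The dump filename below is an assumption.
import gensim

wiki = gensim.corpora.WikiCorpus('enwiki-latest-pages-articles.xml.bz2')
wiki.dictionary.save_as_text('data/wiki_en_output_wordids.txt.bz2')

# Serialize the bag-of-words corpus, then its TF-IDF transformed version.
gensim.corpora.MmCorpus.serialize('data/wiki_en_output_bow.mm', wiki)
bow = gensim.corpora.MmCorpus('data/wiki_en_output_bow.mm')
tfidf = gensim.models.TfidfModel(bow, id2word=wiki.dictionary)
gensim.corpora.MmCorpus.serialize('data/wiki_en_output_tfidf.mm', tfidf[bow])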

ch04/wikitopics_create_hdp.py (+39)
@@ -0,0 +1,39 @@
+# This code is supporting material for the book
+# Building Machine Learning Systems with Python
+# by Willi Richert and Luis Pedro Coelho
+# published by PACKT Publishing
+#
+# It is made available under the MIT License
+
+from __future__ import print_function
+import logging
+import gensim
+import numpy as np
+
+# Set up logging in order to get progress information as the model is being built:
+logging.basicConfig(
+    format='%(asctime)s : %(levelname)s : %(message)s',
+    level=logging.INFO)
+
+# Load the preprocessed corpus (id2word & mm):
+id2word = gensim.corpora.Dictionary.load_from_text(
+    'data/wiki_en_output_wordids.txt.bz2')
+mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')
+
+# Calling the constructor is enough to build the model.
+# This call will take a few hours!
+model = gensim.models.hdpmodel.HdpModel(
+    corpus=mm,
+    id2word=id2word,
+    chunksize=10000)
+
+# Save the model so we do not need to learn it again.
+model.save('wiki_hdp.pkl')
+
+# Compute the document/topic matrix
+topics = np.zeros((len(mm), model.num_topics))
+for di, doc in enumerate(mm):
+    doc_top = model[doc]
+    for ti, tv in doc_top:
+        topics[di, ti] += tv
+np.save('topics_hdp.npy', topics)
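
Once training finishes, the saved model and matrix can be reloaded for a
quick sanity check. A minimal sketch, assuming the output files written by
the script above; print_topics argument names differ across gensim
versions, so the values (10 topics, 8 words each) are passed positionally
and are arbitrary:

import gensim
import numpy as np

# Reload the trained HDP model and the dense document/topic matrix.
model = gensim.models.hdpmodel.HdpModel.load('wiki_hdp.pkl')
topics = np.load('topics_hdp.npy')

# Unlike LDA, HDP infers how many topics the data supports; list a few.
for topic in model.print_topics(10, 8):
    print(topic)

# Which inferred topic carries the most total weight across Wikipedia?
print('Heaviest topic:', topics.sum(0).argmax())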
