Commit 24ac39d

ENH Add script to process Wikipedia with HDP
1 parent 3ae4c9d commit 24ac39d

2 files changed: +43 -0

ch04/README.rst (+4)
@@ -49,3 +49,7 @@ Scripts
 
 blei_lda.py
     Computes LDA using the AP Corpus.
+wikitopics_create.py
+    Creates the topic model for Wikipedia using LDA (must download the Wikipedia database first)
+wikitopics_create_hdp.py
+    Creates the topic model for Wikipedia using HDP (must download the Wikipedia database first)
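
Both new scripts assume the Wikipedia dump has already been downloaded and
preprocessed into gensim's bag-of-words/TF-IDF format. A minimal sketch of
that step in Python, mirroring what gensim's bundled make_wiki script does
(the dump filename is an assumption; any recent dump from
dumps.wikimedia.org works, and this step takes several hours):

# Preprocessing sketch (not part of this commit): builds the id2word and
# TF-IDF files that the wikitopics scripts load.
# The dump filename below is an assumption.
import gensim

wiki = gensim.corpora.WikiCorpus('enwiki-latest-pages-articles.xml.bz2')
wiki.dictionary.save_as_text('data/wiki_en_output_wordids.txt.bz2')

# Serialize the bag-of-words corpus, then its TF-IDF transformed version.
gensim.corpora.MmCorpus.serialize('data/wiki_en_output_bow.mm', wiki)
bow = gensim.corpora.MmCorpus('data/wiki_en_output_bow.mm')
tfidf = gensim.models.TfidfModel(bow, id2word=wiki.dictionary)
gensim.corpora.MmCorpus.serialize('data/wiki_en_output_tfidf.mm', tfidf[bow])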

ch04/wikitopics_create_hdp.py (+39)
@@ -0,0 +1,39 @@
+# This code is supporting material for the book
+# Building Machine Learning Systems with Python
+# by Willi Richert and Luis Pedro Coelho
+# published by PACKT Publishing
+#
+# It is made available under the MIT License
+
+from __future__ import print_function
+import logging
+import gensim
+import numpy as np
+
+# Set up logging in order to get progress information as the model is being built:
+logging.basicConfig(
+    format='%(asctime)s : %(levelname)s : %(message)s',
+    level=logging.INFO)
+
+# Load the preprocessed corpus (id2word & mm):
+id2word = gensim.corpora.Dictionary.load_from_text(
+    'data/wiki_en_output_wordids.txt.bz2')
+mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')
+
+# Calling the constructor is enough to build the model.
+# This call will take a few hours!
+model = gensim.models.hdpmodel.HdpModel(
+    corpus=mm,
+    id2word=id2word,
+    chunksize=10000)
+
+# Save the model so we do not need to learn it again.
+model.save('wiki_hdp.pkl')
+
+# Compute the document/topic matrix
+topics = np.zeros((len(mm), model.num_topics))
+for di, doc in enumerate(mm):
+    doc_top = model[doc]
+    for ti, tv in doc_top:
+        topics[di, ti] += tv
+np.save('topics_hdp.npy', topics)
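
Once training finishes, the saved model and matrix can be reloaded for a
quick sanity check. A minimal sketch, assuming the output files written by
the script above; print_topics argument names differ across gensim
versions, so the values (10 topics, 8 words each) are passed positionally
and are arbitrary:

import gensim
import numpy as np

# Reload the trained HDP model and the dense document/topic matrix.
model = gensim.models.hdpmodel.HdpModel.load('wiki_hdp.pkl')
topics = np.load('topics_hdp.npy')

# Unlike LDA, HDP infers how many topics the data supports; list a few.
for topic in model.print_topics(10, 8):
    print(topic)

# Which inferred topic carries the most total weight across Wikipedia?
print('Heaviest topic:', topics.sum(0).argmax())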
