Collect data with feedparser

WuLC · WuLC · commit c501ebd4d6b4 · 2016-12-12T23:29:04.000+08:00
diff --git a/python/Clustering.py b/python/Clustering.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+"""
+"""
+# @Author: WuLC
+# @Date:   2016-12-11 22:04:08
+# @Last modified by:   WuLC
+# @Last Modified time: 2016-12-12 22:50:55
+# @Email: liangchaowu5@gmail.com
+
+# @Referer: chapter 3 of the book 《programming-collective-intelligence》
+
+import os
+import io
+import re
+import feedparser
+from collections import defaultdict
+
+
+def extract_words(content):
+    """Strip tags from content and return its words, just deal with English here.
+
+    Args:
+        content (str): text to extract words from, may hold tags like <XXXX>
+
+    Returns:
+        list: lowercase words, split on every run of non-letter characters
+    """
+    txt = re.sub(r'<[^>]+>', '', content)  # remove tags so their names are not counted
+    words = re.split(r'[^A-Za-z]+', txt)  # split the stripped text; '^' separates too
+    return [word.lower() for word in words if word]  # lowercase, skip empty pieces
+
+
+def parse_rss(target_url):
+    """Parse the rss of a blog and count the words of all its entries.
+
+    Args:
+        target_url (str): url of the rss
+
+    Returns:
+        (title, word_count): title of the blog ('empty title' if the feed has
+        none) and a defaultdict with how many times each word appears in it
+    """
+    rss = feedparser.parse(target_url)
+    word_count = defaultdict(int)
+    for entry in rss.entries:  # traverse all passages of the blog
+        # .get() keeps an entry that lacks a title or a body from raising
+        # AttributeError; the missing part simply contributes no words
+        title = entry.get('title') or ''
+        summary = entry.get('summary') or entry.get('description') or ''
+        for word in extract_words(title + ' ' + summary):
+            word_count[word] += 1
+    return rss.feed.get('title', 'empty title'), word_count  # feed title may be missing
+
+
+def get_content_from_feedlist(feed_list, data_file):
+    """Extract the word counts of every rss in feed_list into data_file.
+
+    Only words appearing in 10% to 50% of the blogs are kept as columns.
+
+    Args:
+        feed_list (str): path of the feedlist file, each row represent a rss
+        data_file (str): path of the data file
+
+    Returns:
+        None
+    """
+    word_appear_count = defaultdict(int)  # count how many blogs a word appears in
+    blog_word_count = {}  # words of each blog
+    empty_title_count = 0
+    # io.open (unlike file()) exists on Python 2 and 3 and closes the list for us
+    with io.open(feed_list, encoding='utf8') as rf:
+        for rss_url in (line.strip() for line in rf):
+            if not rss_url:  # a blank row is not a feed
+                continue
+            title, wc = parse_rss(rss_url)
+            if title == 'empty title':  # cannot get title of some rss
+                empty_title_count += 1
+                title = title+' %s'%empty_title_count
+            blog_word_count[title] = wc
+            for word in wc:
+                word_appear_count[word] += 1
+
+    # calculate the appearing percentage of each word
+    # record those words that appear within maximum and minimum percentage
+    minimum, maximum = 0.1, 0.5
+    total_blog = len(blog_word_count)
+    word_list = [word for word, count in word_appear_count.items()
+                 if minimum <= count*1.0/total_blog <= maximum]
+
+    # write data into data_file; u'' templates keep every piece unicode, so a
+    # non-ASCII blog title no longer breaks the way str.decode('utf8') did
+    with io.open(data_file, mode='w', encoding='utf8') as wf:
+        header = u''.join(u'\t%s' % word for word in word_list)
+        wf.write(u'Blog' + header + u'\n')
+        # one row per blog: its title, then the count of every kept word
+        for blog_title, blog_words in blog_word_count.items():
+            wf.write(u'%s' % blog_title)
+            for word in word_list:
+                wf.write(u'\t%s' % blog_words.get(word, 0))
+            wf.write(u'\n')
+
+
+if __name__ == '__main__':
+    # collect the feeds only when the data file has not been built yet
+    feed_list, data_file = 'Clustering_data/feedlist.txt', 'Clustering_data/data'
+    if not os.path.exists(data_file):
+        get_content_from_feedlist(feed_list, data_file)
+    
+
+    
diff --git a/python/Clustering_data/feedlist.txt b/python/Clustering_data/feedlist.txt
@@ -0,0 +1,97 @@
+http://feeds.feedburner.com/37signals/beMH
+http://feeds.feedburner.com/blogspot/bRuz
+http://battellemedia.com/index.xml
+http://blog.guykawasaki.com/index.rdf
+http://blog.outer-court.com/rss.xml
+http://feeds.searchenginewatch.com/sewblog
+http://blog.topix.net/index.rdf
+http://blogs.abcnews.com/theblotter/index.rdf
+http://feeds.feedburner.com/ConsumingExperienceFull
+http://flagrantdisregard.com/index.php/feed/
+http://featured.gigaom.com/feed/
+http://gizmodo.com/index.xml
+http://gofugyourself.typepad.com/go_fug_yourself/index.rdf
+http://googleblog.blogspot.com/rss.xml
+http://feeds.feedburner.com/GoogleOperatingSystem
+http://headrush.typepad.com/creating_passionate_users/index.rdf
+http://feeds.feedburner.com/instapundit/main
+http://jeremy.zawodny.com/blog/rss2.xml
+http://joi.ito.com/index.rdf
+http://feeds.feedburner.com/Mashable
+http://michellemalkin.com/index.rdf
+http://moblogsmoproblems.blogspot.com/rss.xml
+http://newsbusters.org/node/feed
+http://beta.blogger.com/feeds/27154654/posts/full?alt=rss
+http://feeds.feedburner.com/paulstamatiou
+http://powerlineblog.com/index.rdf
+http://feeds.feedburner.com/Publishing20
+http://radar.oreilly.com/index.rdf
+http://scienceblogs.com/pharyngula/index.xml
+http://scobleizer.wordpress.com/feed/
+http://sethgodin.typepad.com/seths_blog/index.rdf
+http://rss.slashdot.org/Slashdot/slashdot
+http://thinkprogress.org/feed/
+http://feeds.feedburner.com/andrewsullivan/rApM
+http://wilwheaton.typepad.com/wwdnbackup/index.rdf
+http://www.43folders.com/feed/
+http://www.456bereastreet.com/feed.xml
+http://www.autoblog.com/rss.xml
+http://www.bloggersblog.com/rss.xml
+http://www.bloglines.com/rss/about/news
+http://www.blogmaverick.com/rss.xml
+http://www.boingboing.net/index.rdf
+http://www.buzzmachine.com/index.xml
+http://www.captainsquartersblog.com/mt/index.rdf
+http://www.coolhunting.com/index.rdf
+http://feeds.copyblogger.com/Copyblogger
+http://feeds.feedburner.com/crooksandliars/YaCP
+http://feeds.dailykos.com/dailykos/index.xml
+http://www.deadspin.com/index.xml
+http://www.downloadsquad.com/rss.xml
+http://www.engadget.com/rss.xml
+http://www.gapingvoid.com/index.rdf
+http://www.gawker.com/index.xml
+http://www.gothamist.com/index.rdf
+http://www.huffingtonpost.com/raw_feed_index.rdf
+http://www.hyperorg.com/blogger/index.rdf
+http://www.joelonsoftware.com/rss.xml
+http://www.joystiq.com/rss.xml
+http://www.kotaku.com/index.xml
+http://feeds.kottke.org/main
+http://www.lifehack.org/feed/
+http://www.lifehacker.com/index.xml
+http://littlegreenfootballs.com/weblog/lgf-rss.php
+http://www.makezine.com/blog/index.xml
+http://www.mattcutts.com/blog/feed/
+http://xml.metafilter.com/rss.xml
+http://www.mezzoblue.com/rss/index.xml
+http://www.micropersuasion.com/index.rdf
+http://www.neilgaiman.com/journal/feed/rss.xml
+http://www.oilman.ca/feed/
+http://www.perezhilton.com/index.xml
+http://www.plasticbag.org/index.rdf
+http://www.powazek.com/rss.xml
+http://www.problogger.net/feed/
+http://feeds.feedburner.com/QuickOnlineTips
+http://www.readwriteweb.com/rss.xml
+http://www.schneier.com/blog/index.rdf
+http://scienceblogs.com/sample/combined.xml
+http://www.seroundtable.com/index.rdf
+http://www.shoemoney.com/feed/
+http://www.sifry.com/alerts/index.rdf
+http://www.simplebits.com/xml/rss.xml
+http://feeds.feedburner.com/Spikedhumor
+http://www.stevepavlina.com/blog/feed
+http://www.talkingpointsmemo.com/index.xml
+http://www.tbray.org/ongoing/ongoing.rss
+http://feeds.feedburner.com/TechCrunch
+http://www.techdirt.com/techdirt_rss.xml
+http://www.techeblog.com/index.php/feed/
+http://www.thesuperficial.com/index.xml
+http://www.tmz.com/rss.xml
+http://www.treehugger.com/index.rdf
+http://www.tuaw.com/rss.xml
+http://www.valleywag.com/index.xml
+http://www.we-make-money-not-art.com/index.rdf
+http://www.wired.com/rss/index.xml
+http://www.wonkette.com/index.xml