Skip to content

Commit c501ebd

Browse files
committed
Collect data with feedparser
1 parent 521596b commit c501ebd

File tree

2 files changed

+207
-0
lines changed

2 files changed

+207
-0
lines changed

python/Clustering.py

+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
"""
4+
# @Author: WuLC
5+
# @Date: 2016-12-11 22:04:08
6+
# @Last modified by: WuLC
7+
# @Last Modified time: 2016-12-12 22:50:55
8+
9+
10+
# @Referer: chapter 3 of the book 《programming-collective-intelligence》
11+
12+
import os
13+
import io
14+
import re
15+
import feedparser
16+
from collections import defaultdict
17+
18+
19+
def extract_words(content):
    """Extract lowercase English words from a piece of (possibly HTML) text.

    Markup tags of the form ``<...>`` are removed first, then the remaining
    text is split on every run of characters that are not ASCII letters.

    Args:
        content (str): text to extract words from; may contain HTML tags

    Returns:
        list: the words found in content, lowercased, in order of appearance
    """
    if not content:
        return []
    # Replace each tag with a space so that tag/attribute names are not
    # counted as words and words in adjacent elements do not get glued together.
    txt = re.sub(r'<[^>]+>', ' ', content)
    # Split the tag-free text (not the raw content) on runs of non-letters.
    words = re.split(r'[^A-Za-z]+', txt)
    # Splitting leaves empty strings at the edges; drop them and lowercase the rest.
    return [word.lower() for word in words if word]
31+
32+
33+
def parse_rss(target_url):
    """Parse the feed of a blog and count the words used in its entries.

    Args:
        target_url (str): url of the rss

    Returns:
        (title, word_count): title of the blog ('empty title' when the feed
            has none) and a defaultdict(int) mapping each word to the number
            of times it appears in the titles and summaries of all entries
    """
    rss = feedparser.parse(target_url)
    word_count = defaultdict(int)
    for entry in rss.entries:  # traverse all passages of the blog
        # An entry is not guaranteed to carry a title or a summary/description;
        # fall back to an empty string instead of raising AttributeError.
        title = entry.get('title') or ''
        summary = entry.get('summary') or entry.get('description') or ''
        for word in extract_words(title + ' ' + summary):
            word_count[word] += 1
    # the feed title can be missing as well, hence the placeholder
    return rss.feed.get('title', 'empty title'), word_count
53+
54+
55+
def _as_text(value):
    """Return value as a text (unicode) string, decoding UTF-8 byte strings."""
    if isinstance(value, bytes):
        return value.decode('utf8')
    return value


def get_content_from_feedlist(feed_list, data_file, minimum=0.1, maximum=0.5):
    """Build a blog/word count table from every rss listed in feed_list.

    Each feed is parsed with parse_rss. To reduce the number of columns, only
    words that appear in a fraction of the blogs between minimum and maximum
    (both inclusive) are kept. The result is written to data_file as
    tab-separated text: a header row ``Blog<TAB>word1<TAB>word2...`` followed
    by one row per blog holding its title and the count of each kept word.

    Args:
        feed_list (str): path of the feedlist file, each row represents a rss
        data_file (str): path of the data file to write (UTF-8)
        minimum (float): lowest fraction of blogs a word must appear in
        maximum (float): highest fraction of blogs a word may appear in

    Returns:
        None
    """
    word_appear_count = defaultdict(int)  # how many blogs each word appears in
    blog_word_count = {}                  # words of each blog, keyed by title
    empty_title_count = 0
    # io.open works on both Python 2 and 3 and the with-block closes the file.
    with io.open(feed_list, encoding='utf8') as feeds:
        for line in feeds:
            rss_url = line.strip()
            if not rss_url:  # skip blank rows instead of parsing an empty url
                continue
            title, wc = parse_rss(rss_url)
            if title == 'empty title':  # cannot get title of some rss
                # number the placeholder so untitled blogs do not overwrite each other
                empty_title_count += 1
                title = '%s %s' % (title, empty_title_count)
            blog_word_count[title] = wc
            for word in wc:
                word_appear_count[word] += 1

    # calculate the appearing percentage of each word and
    # keep those words that appear within minimum and maximum percentage
    total_blog = len(blog_word_count)
    word_list = [
        word for word, count in word_appear_count.items()
        # * 1.0 forces float division on Python 2
        if minimum <= count * 1.0 / total_blog <= maximum
    ]

    # Write the table. Everything is converted with _as_text rather than a
    # blind .decode(): calling .decode on a string that is already unicode
    # makes Python 2 encode it to ASCII first, which fails for non-ASCII
    # titles, and Python 3 str has no .decode at all.
    with io.open(data_file, mode='w', encoding='utf8') as wf:
        header = [u'Blog'] + [_as_text(word) for word in word_list]
        wf.write(u'\t'.join(header) + u'\n')
        # words of each blog
        for blog_title, blog_words in blog_word_count.items():
            # .get avoids inserting missing words into the defaultdict
            row = [_as_text(blog_title)]
            row.extend(u'%d' % blog_words.get(word, 0) for word in word_list)
            wf.write(u'\t'.join(row) + u'\n')
101+
102+
103+
if __name__ == '__main__':
    # Both paths are relative to the current working directory.
    feed_list, data_file = 'Clustering_data/feedlist.txt', 'Clustering_data/data'
    # The data file is only generated when it does not exist yet;
    # an existing file is left untouched.
    data_ready = os.path.exists(data_file)
    if not data_ready:
        get_content_from_feedlist(feed_list, data_file)
108+
109+
110+

python/Clustering_data/feedlist.txt

+97
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
http://feeds.feedburner.com/37signals/beMH
2+
http://feeds.feedburner.com/blogspot/bRuz
3+
http://battellemedia.com/index.xml
4+
http://blog.guykawasaki.com/index.rdf
5+
http://blog.outer-court.com/rss.xml
6+
http://feeds.searchenginewatch.com/sewblog
7+
http://blog.topix.net/index.rdf
8+
http://blogs.abcnews.com/theblotter/index.rdf
9+
http://feeds.feedburner.com/ConsumingExperienceFull
10+
http://flagrantdisregard.com/index.php/feed/
11+
http://featured.gigaom.com/feed/
12+
http://gizmodo.com/index.xml
13+
http://gofugyourself.typepad.com/go_fug_yourself/index.rdf
14+
http://googleblog.blogspot.com/rss.xml
15+
http://feeds.feedburner.com/GoogleOperatingSystem
16+
http://headrush.typepad.com/creating_passionate_users/index.rdf
17+
http://feeds.feedburner.com/instapundit/main
18+
http://jeremy.zawodny.com/blog/rss2.xml
19+
http://joi.ito.com/index.rdf
20+
http://feeds.feedburner.com/Mashable
21+
http://michellemalkin.com/index.rdf
22+
http://moblogsmoproblems.blogspot.com/rss.xml
23+
http://newsbusters.org/node/feed
24+
http://beta.blogger.com/feeds/27154654/posts/full?alt=rss
25+
http://feeds.feedburner.com/paulstamatiou
26+
http://powerlineblog.com/index.rdf
27+
http://feeds.feedburner.com/Publishing20
28+
http://radar.oreilly.com/index.rdf
29+
http://scienceblogs.com/pharyngula/index.xml
30+
http://scobleizer.wordpress.com/feed/
31+
http://sethgodin.typepad.com/seths_blog/index.rdf
32+
http://rss.slashdot.org/Slashdot/slashdot
33+
http://thinkprogress.org/feed/
34+
http://feeds.feedburner.com/andrewsullivan/rApM
35+
http://wilwheaton.typepad.com/wwdnbackup/index.rdf
36+
http://www.43folders.com/feed/
37+
http://www.456bereastreet.com/feed.xml
38+
http://www.autoblog.com/rss.xml
39+
http://www.bloggersblog.com/rss.xml
40+
http://www.bloglines.com/rss/about/news
41+
http://www.blogmaverick.com/rss.xml
42+
http://www.boingboing.net/index.rdf
43+
http://www.buzzmachine.com/index.xml
44+
http://www.captainsquartersblog.com/mt/index.rdf
45+
http://www.coolhunting.com/index.rdf
46+
http://feeds.copyblogger.com/Copyblogger
47+
http://feeds.feedburner.com/crooksandliars/YaCP
48+
http://feeds.dailykos.com/dailykos/index.xml
49+
http://www.deadspin.com/index.xml
50+
http://www.downloadsquad.com/rss.xml
51+
http://www.engadget.com/rss.xml
52+
http://www.gapingvoid.com/index.rdf
53+
http://www.gawker.com/index.xml
54+
http://www.gothamist.com/index.rdf
55+
http://www.huffingtonpost.com/raw_feed_index.rdf
56+
http://www.hyperorg.com/blogger/index.rdf
57+
http://www.joelonsoftware.com/rss.xml
58+
http://www.joystiq.com/rss.xml
59+
http://www.kotaku.com/index.xml
60+
http://feeds.kottke.org/main
61+
http://www.lifehack.org/feed/
62+
http://www.lifehacker.com/index.xml
63+
http://littlegreenfootballs.com/weblog/lgf-rss.php
64+
http://www.makezine.com/blog/index.xml
65+
http://www.mattcutts.com/blog/feed/
66+
http://xml.metafilter.com/rss.xml
67+
http://www.mezzoblue.com/rss/index.xml
68+
http://www.micropersuasion.com/index.rdf
69+
http://www.neilgaiman.com/journal/feed/rss.xml
70+
http://www.oilman.ca/feed/
71+
http://www.perezhilton.com/index.xml
72+
http://www.plasticbag.org/index.rdf
73+
http://www.powazek.com/rss.xml
74+
http://www.problogger.net/feed/
75+
http://feeds.feedburner.com/QuickOnlineTips
76+
http://www.readwriteweb.com/rss.xml
77+
http://www.schneier.com/blog/index.rdf
78+
http://scienceblogs.com/sample/combined.xml
79+
http://www.seroundtable.com/index.rdf
80+
http://www.shoemoney.com/feed/
81+
http://www.sifry.com/alerts/index.rdf
82+
http://www.simplebits.com/xml/rss.xml
83+
http://feeds.feedburner.com/Spikedhumor
84+
http://www.stevepavlina.com/blog/feed
85+
http://www.talkingpointsmemo.com/index.xml
86+
http://www.tbray.org/ongoing/ongoing.rss
87+
http://feeds.feedburner.com/TechCrunch
88+
http://www.techdirt.com/techdirt_rss.xml
89+
http://www.techeblog.com/index.php/feed/
90+
http://www.thesuperficial.com/index.xml
91+
http://www.tmz.com/rss.xml
92+
http://www.treehugger.com/index.rdf
93+
http://www.tuaw.com/rss.xml
94+
http://www.valleywag.com/index.xml
95+
http://www.we-make-money-not-art.com/index.rdf
96+
http://www.wired.com/rss/index.xml
97+
http://www.wonkette.com/index.xml

0 commit comments

Comments
 (0)