forked from WuLC/MachineLearningAlgorithm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGetData.py
135 lines (112 loc) · 4.53 KB
/
GetData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# -*- coding: utf-8 -*-
"""
"""
# @Author: WuLC
# @Date: 2016-12-11 22:04:08
# @Last modified by: WuLC
# @Last Modified time: 2017-02-12 17:28:07
# @Email: [email protected]
# @Referer: chapter 3 of the book 《programming-collective-intelligence》
############################################################################
# get data from feedlist and store it in the data file for experiment
###########################################################################
import os
import io
import re
import feedparser
from collections import defaultdict
def extract_words(content):
    """extract words from content, just deal with English here
    Args:
        content (str): content to be extracted words, may contain HTML tags
    Returns:
        list: a list of lowercase words that are extracted from content
    """
    txt = re.compile(r'<[^>]+>').sub('', content)  # remove tag in the form of <XXXX>
    # BUG FIX: split the tag-stripped text (txt), not the raw content,
    # otherwise the tag removal above is dead code.
    # BUG FIX: the original class [^A-Z^a-z] contained a stray literal '^',
    # which made '^' count as a word character; [^A-Za-z] is intended.
    words = re.compile(r'[^A-Za-z]+').split(txt)  # split words by all non-alpha characters
    return [word.lower() for word in words if word != '']  # turn all words into lowercase
def parse_rss(target_url):
    """parse a url which is the rss of a blog
    Args:
        target_url (str): url of the rss
    Returns:
        (title, word_count): title of the blog and how many times each word appears in the blog
    """
    rss = feedparser.parse(target_url)
    word_count = defaultdict(int)
    for entry in rss.entries:  # traverse all passages of the blog
        # BUG FIX: some entries carry neither 'summary' nor 'description';
        # attribute access (entry.description) would raise AttributeError,
        # so fall back through .get with an empty-string default.
        summary = entry.get('summary', entry.get('description', ''))
        # entry title may also be absent on malformed feeds
        words = extract_words(entry.get('title', '') + ' ' + summary)
        for word in words:
            word_count[word] += 1
    return rss.feed.get('title', 'empty title'), word_count  # title can be empty sometimes
def get_content_from_feedlist(feed_list, data_file):
    """extract content from every rss in feedlist, store them in the data_file
    reduce the number of total words by selecting those words that appear within maximum and minimum percentage
    Args:
        feed_list (str): path of the feedlist file, each row represents a rss url
        data_file (str): path of the data file to write (tab-separated matrix)
    Returns:
        None
    """
    word_appear_count = defaultdict(int)  # count how many blogs each word appears in
    blog_word_count = {}  # word counts of each blog, keyed by (possibly numbered) title
    empty_title_count = 0
    # BUG FIX: file() is a Python 2-only builtin (removed in Python 3) and the
    # handle was never closed; use io.open in a with-block instead.
    with io.open(feed_list, mode='r', encoding='utf8') as feeds:
        for rss_url in feeds:
            title, wc = parse_rss(rss_url.strip())
            if title == 'empty title':  # cannot get title of some rss
                empty_title_count += 1
                title = title + ' %s' % empty_title_count  # disambiguate duplicate titles
            blog_word_count[title] = wc
            for word in wc:
                word_appear_count[word] += 1
    # calculate the appearing percentage of each word
    # record those words that appear within maximum and minimum percentage
    minimum, maximum = 0.1, 0.5
    total_blog = len(blog_word_count)
    # guard against an empty feed list (would otherwise divide by zero)
    if total_blog:
        word_list = [word for word, count in word_appear_count.items()
                     if minimum <= count * 1.0 / total_blog <= maximum]
    else:
        word_list = []
    # write data into data_file
    # BUG FIX: str.decode('utf8') does not exist on Python 3 str; unicode
    # literals (u'...') satisfy io.open on Python 2 and are no-ops on Python 3.
    with io.open(data_file, mode='w', encoding='utf8') as wf:
        wf.write(u'Blog')
        for word in word_list:
            wf.write(u'\t%s' % word)
        wf.write(u'\n')
        # one row per blog: title, then the count of each selected word
        for blog_title, blog_words in blog_word_count.items():
            wf.write(u'%s' % blog_title)
            for word in word_list:
                wf.write(u'\t%s' % blog_words.get(word, 0))
            wf.write(u'\n')
def read_data(data_file):
    """read content from the formatted data file
    Args:
        data_file (str): path of the formatted (tab-separated) data file
    Returns:
        tuple: (col_names, blog_names, blog_data) where col_names is the list
            of words from the header row (excluding the leading 'Blog' cell),
            blog_names is the list of row labels, and blog_data is a list of
            float vectors aligned with col_names.
    """
    col_names = None
    blog_names = []
    blog_data = []
    with io.open(data_file, mode='r', encoding='utf8') as rf:
        for line in rf:
            # BUG FIX: strip() also removed leading/trailing tabs, which
            # corrupts empty first/last fields; only drop the newline.
            fields = line.rstrip('\n').split('\t')
            if col_names is None:  # first line is the header row (was '== None')
                col_names = fields[1:]
            else:
                blog_names.append(fields[0])
                blog_data.append([float(x) for x in fields[1:]])
    return col_names, blog_names, blog_data
if __name__ == '__main__':
    # source feed list and destination data file for the clustering experiment
    feed_list = 'Clustering_data/feedlist.txt'
    data_file = 'Clustering_data/data'
    # fetching every feed is slow, so only build the data file once
    if os.path.exists(data_file):
        pass  # already generated on a previous run
    else:
        get_content_from_feedlist(feed_list, data_file)