-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfetcher.py
108 lines (93 loc) · 2.9 KB
/
fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/python
#----------------------------------------------------------------------
# Copyright (c) 2010 Giuseppe Attardi ([email protected]).
#----------------------------------------------------------------------
"""Feed Fetcher:
Fetches RSS feeds from given set of URLs.
Usage:
fetcher.py [options] file
Options:
-d, --db database file
-h, --help display this help and exit
"""
import calendar
import getopt
import re
import sys
import time

import feedparser

from store import *
def show_help():
    """Write the module docstring (the usage text) to stderr and exit.

    Always terminates the process with exit status 1.
    """
    usage = '%s\n' % (__doc__,)
    sys.stderr.write(usage)
    sys.exit(1)
def capitalize(x):
    """Capitalize one word, keeping a one-letter apostrophe prefix intact.

    When the word is longer than two characters and its second character
    is an apostrophe (e.g. "D'ANGELO"), the first two characters are left
    as they are and only the remainder is capitalized ("D'Angelo").
    Any other word is returned through str.capitalize().
    """
    has_prefix = len(x) > 2 and x[1] == "'"
    if not has_prefix:
        return x.capitalize()
    prefix, remainder = x[:2], x[2:]
    return prefix + remainder.capitalize()
def getAuthor(e, description):
    """Heuristically recover the real author of feed entry *e*.

    Starts from the entry's 'author' field (empty string when absent).
    When that value starts with the placeholder address tested below, the
    description is searched for a "<br />di NAME<br" byline; if one is
    found, NAME is returned with every whitespace-separated word passed
    through capitalize(). Otherwise the 'author' value is returned as is.

    NOTE(review): the indentation of this file was lost, so the scope of
    the capitalization step is reconstructed -- it is assumed to apply
    only to a name extracted from the description. Confirm against the
    original source.
    """
    name = e.get('author', u'')
    if not name.startswith('[email protected]'):
        return name
    byline = re.search('<br />di (.+?)<br', description)
    if byline is None:
        return name
    words = byline.group(1).split()
    return ' '.join(map(capitalize, words))
def main():
try:
opts, args = getopt.gnu_getopt(sys.argv[1:], 'd:h', ['db', 'help'])
except getopt.GetoptError:
show_help()
db = 'newsflow'
for opt, arg in opts:
if opt in ('-h', '--help'):
show_help()
elif opt in ('-d', '--db'):
db = arg
if len(args) != 1:
show_help()
try:
list = open(args[0])
except IOError:
print "Can't open file", args[0]
sys.exit(2)
store = StoreCreate(db)
count = 0
for url in list:
if url[0] == '#': # skip comments
continue
d = feedparser.parse(url)
feedTitle = d.feed.get('title', u'')
print feedTitle
feedDescription = d.feed.get('description', u'')
feedLink = d.feed.get('link', u'')
ch = Channel(feedTitle, feedLink, feedDescription)
# FIXME: should be
# ch = Channel(feedLink, feedTitle, feedDescription)
for e in d.entries:
if not 'id' in e:
if not e.link:
continue
e.id = e.link
if store.get(Item, e.id):
continue
count += 1
title = e.get('title', u'')
link = e.get('link', u'')
description = e.get('description', u'')
if 'date_parsed' in e:
t = int(time.mktime(e.date_parsed))
else:
t = time.time()
enclosure = u''
if 'enclosures' in e:
enclosure = unicode(e.enclosures[0])
category = e.get('category', u'')
author = getAuthor(e, description)
item = Item(ch, e.id, title, link, description, t,
enclosure, category, author)
store.add(item)
store.commit()
print "Collected", count, "articles."
# Run the fetcher only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()