# bills.py (from unitedstates/congress)

import utils
import os
import re
import time
from lxml import html
import logging
import bill_info
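
# Scrapes THOMAS (thomas.loc.gov) listing pages to find every bill in a
# given Congress, then hands each bill ID to bill_info.fetch_bill to
# download and save its data.
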
def run(options):
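    # `options` is a dict of flags; the keys this module reads are bill_id,
    # congress, limit, pages_only, bill_type, force, and raise.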
    bill_id = options.get('bill_id', None)

    if bill_id:
        bill_type, number, congress = utils.split_bill_id(bill_id)
        to_fetch = [bill_id]
    else:
        congress = options.get('congress', utils.current_congress())
        to_fetch = bill_ids_for(congress, options)
        if not to_fetch:
            logging.error("Error figuring out which bills to download, aborting.")
            return None

        limit = options.get('limit', None)
        if limit:
            to_fetch = to_fetch[:int(limit)]

    if options.get('pages_only', False):
        return None

    print "Going to fetch %i bills from congress #%s" % (len(to_fetch), congress)

    # get the mapping from THOMAS's committee names to THOMAS's committee IDs,
    # found on the advanced search page. committee_names[congress][name] = ID,
    # with subcommittee names as the committee name plus a pipe plus the
    # subcommittee name.
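    # For illustration, the mapping's shape (hypothetical values):
    #   committee_names["113"]["House Judiciary"] == "HSJU"
    #   committee_names["113"]["House Judiciary|Crime"] == "HSJU08"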
    committee_names = bill_info.fetch_committee_names(congress, options)

    errors = []
    saved = []
    skips = []

    for bill_id in to_fetch:
        try:
            results = bill_info.fetch_bill(bill_id, committee_names, options)
        except Exception, e:
            if options.get('raise', False):
                raise
            else:
                errors.append((bill_id, e))
                continue

        if results.get('ok', False):
            if results.get('saved', False):
                saved.append(bill_id)
                logging.info("[%s] Updated bill" % bill_id)
            else:
                skips.append(bill_id)
                logging.error("[%s] Skipping bill: %s" % (bill_id, results['reason']))
        else:
            errors.append((bill_id, results))
            logging.error("[%s] Error: %s" % (bill_id, results['reason']))

    if len(errors) > 0:
        message = "\nErrors for %s bills:\n" % len(errors)
        for bill_id, error in errors:
            if isinstance(error, Exception):
                message += "[%s] Exception:\n\n" % bill_id
                message += utils.format_exception(error)
            else:
                message += "[%s] %s" % (bill_id, error)

        utils.admin(message)  # email if possible

    logging.error("\nSkipped %s bills." % len(skips))
    logging.warning("Saved data for %s bills." % len(saved))

# page through listings for bills of a particular congress
def bill_ids_for(congress, options):
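    # Returns a list of bill IDs like "hr3590-111" (lowercased bill type and
    # number, a hyphen, then the congress), or None if a page fails to download.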
    bill_ids = []

    bill_type = options.get('bill_type', None)
    if bill_type:
        bill_types = [bill_type]
    else:
        bill_types = utils.thomas_types.keys()

    for bill_type in bill_types:
        # match only links to landing pages of this bill type.
        # it shouldn't catch stray links outside of the confines of the 100
        # on the page, but if it does, no big deal
        link_pattern = "^\s*%s\d+\s*$" % utils.thomas_types[bill_type][1]
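
        # Illustration (assuming utils.thomas_types["hr"][1] is "H.R."): the
        # pattern becomes "^\s*H.R.\d+\s*$", matching link text like "H.R.3590".
        # The unescaped dots match any character, which is close enough here.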

        # loop through pages and collect the links on each page until
        # we hit a page with < 100 results, or no results
        offset = 0
        while True:
            # download page, find the matching links
            page = utils.download(
                page_for(congress, bill_type, offset),
                page_cache_for(congress, bill_type, offset),
                options.get('force', False))
            if not page:
                logging.error("Couldn't download page with offset %i, aborting" % offset)
                return None

            # extract matching links
            doc = html.document_fromstring(page)
            links = doc.xpath(
                "//a[re:match(text(), '%s')]" % link_pattern,
                namespaces={"re": "http://exslt.org/regular-expressions"})
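            # (the "re" namespace enables lxml's EXSLT regular-expressions
            # extension, which provides re:match inside XPath expressions)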

            # extract the bill ID from each link
            for link in links:
                code = link.text.lower().replace(".", "").replace(" ", "")
                bill_ids.append("%s-%s" % (code, congress))
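                # e.g. link text "H.R.3590" becomes "hr3590", yielding a
                # bill ID like "hr3590-111"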

            if len(links) < 100:
                break

            offset += 100

            # sanity check, while True loops are dangerous
            if offset > 100000:
                break

    return utils.uniq(bill_ids)

def page_for(congress, bill_type, offset):
    thomas_type = utils.thomas_types[bill_type][0]
    congress = int(congress)
    return "http://thomas.loc.gov/cgi-bin/bdquery/d?d%03d:%s:./list/bss/d%03d%s.lst:[[o]]" % (congress, offset, congress, thomas_type)
def page_cache_for(congress, bill_type, offset):
return "%s/bills/pages/%s/%i.html" % (congress, bill_type, offset)