# encoding: UTF-8
require 'nokogiri'
require 'uri'
require 'restclient'
require 'fileutils'
require_relative 'upton/utils'
require_relative 'upton/downloader'
##
# This module contains a scraper called Upton
##
module Upton
##
# *Upton* is a framework for easy web-scraping with a useful debug mode
# that doesn't hammer your target's servers. It does the repetitive parts of
# writing scrapers, so you only have to write the unique parts for each site.
#
# Upton operates on the theory that, for most scraping projects, you need to
# scrape two types of pages:
#
# 1. Index pages, which list instance pages. For example, a job search
# site's search page or a newspaper's homepage.
# 2. Instance pages, which represent the goal of your scraping, e.g.
# job listings or news articles.
#
# Upton::Scraper can be used as-is for basic use-cases by:
# 1. specifying the pages to be scraped in `new` as an index page
# or as an Array of URLs.
# 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
# block from Upton::Utils.
# For more complicated cases, subclass Upton::Scraper,
# e.g. +MyScraper < Upton::Scraper+, and override various methods.
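#
# A minimal usage sketch (the URL, selector, and CSS classes below are
# hypothetical, not part of Upton):
#
#   require 'upton'
#
#   scraper = Upton::Scraper.new("http://www.example.com/articles", "a.article-link")
#   headlines = scraper.scrape do |article_html, url, index|
#     Nokogiri::HTML(article_html).css("h1.headline").text
#   end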
##
class Scraper
EMPTY_STRING = ''
attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
:paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
:pagination_interval
##
# This is the main user-facing method for a basic scraper.
# Call +scrape+ with a block; this block will be called on
# the text of each instance page (and, optionally, its URL and its index
# in the list of instance URLs returned by +get_index+).
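#
# For example, the block may accept up to four arguments (a sketch; the
# selector used to build +scraper+ is hypothetical):
#
#   scraper.scrape do |html, url, instance_index, pagination_index|
#     Nokogiri::HTML(html).css("h1").text
#   end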
##
def scrape(&blk)
self.url_array = self.get_index unless self.url_array
blk = Proc.new{|x| x} if blk.nil?
self.scrape_from_list(self.url_array, blk)
end
##
# +index_url_or_array+: A list of string URLs, OR
# the URL of the page containing the list of instances.
# +selector+: The XPath expression or CSS selector that specifies the
# anchor elements within the page, if a URL is specified for
# the previous argument.
#
# These options are a shortcut. If you plan to override +get_index+, you
# do not need to set them.
# If you don't specify a selector, the first argument will be treated as a
# list of URLs.
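#
# For example (URLs and selector are hypothetical):
#
#   # an index page plus a selector for the links to follow
#   Upton::Scraper.new("http://www.example.com/news", "a.story-link")
#
#   # or an explicit list of instance URLs
#   Upton::Scraper.new(["http://www.example.com/story1", "http://www.example.com/story2"])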
##
def initialize(index_url_or_array, selector="")
# If the first argument responds to #each_with_index, treat it as a list of
# instance URLs; otherwise treat it as the URL of an index page to be
# parsed with the given selector.
# e.g. Scraper.new(["http://jeremybmerrill.com"])
# TODO: rewrite this, because it's a little silly (i.e. there should be a more
# sensible division of how these arguments work).
if index_url_or_array.respond_to? :each_with_index
@url_array = index_url_or_array
else
@index_url = index_url_or_array
@index_selector = selector
end
# If true, then Upton prints information about when it gets
# files from the internet and when it gets them from its stash.
@verbose = false
# If true, then Upton fetches each instance page only once;
# future requests for that file are served from the locally stashed
# version.
# You may want to set @debug to false for production (but maybe not).
# You can also control stashing behavior on a per-call basis with the
# optional second argument to get_page, if, for instance, you want to
# stash certain instance pages, e.g. based on their modification date.
@debug = true
# Index debug does the same, but for index pages.
@index_debug = false
# To avoid hammering servers, Upton waits, by default, 30
# seconds between requests to the remote server.
@sleep_time_between_requests = 30 #seconds
# If true, then Upton will attempt to scrape paginated index pages
@paginated = false
# Default query string parameter used to specify the current page
@pagination_param = 'page'
# Default number of paginated pages to scrape
@pagination_max_pages = 2
# Default starting number for pagination (the second page is this plus @pagination_interval).
@pagination_start_index = 1
# Default value to increment page number by
@pagination_interval = 1
# Folder name for stashes, if you want them to be stored somewhere else,
# e.g. under /tmp.
if @stash_folder
FileUtils.mkdir_p(@stash_folder) unless Dir.exist?(@stash_folder)
end
end
##
# If instance pages are paginated, <b>you must override</b>
# this method to return the next URL, given the current URL and its index.
#
# If instance pages aren't paginated, there's no need to override this.
#
# Pagination stops when this method returns an empty string (or repeats the
# previous URL), or when fetching the URL returns an empty page or an error.
#
# e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
# ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
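#
# A sketch of an override for a site that paginates instances with a `page`
# query-string parameter (the class name, URL scheme, and page limit are
# hypothetical):
#
#   class MyScraper < Upton::Scraper
#     def next_instance_page_url(url, pagination_index)
#       return "" if pagination_index > 5  # stop after five pages
#       uri = URI.parse(url)
#       query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
#       query["page"] = pagination_index
#       uri.query = URI.encode_www_form(query)
#       uri.to_s
#     end
#   end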
##
def next_instance_page_url(url, pagination_index)
EMPTY_STRING
end
##
# Return the next URL to scrape, given the current URL and its index.
#
# Pagination stops when this method returns an empty string or when fetching
# the URL returns an empty page or an error.
#
# If @paginated is not set (the default), this method returns the URL
# unchanged, which stops pagination after the first index page.
#
# If @paginated is set, this method will return the next pagination URL
# to scrape, built from @pagination_param and the pagination_index.
#
# If the pagination_index is greater than @pagination_max_pages, then the
# method will return an empty string.
#
# Override this method to handle pagination in an alternative way,
# e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
# ought to return "http://whatever.com/articles?page=2"
#
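# A sketch of how the pagination settings fit together (the URL and selector
# are hypothetical):
#
#   scraper = Upton::Scraper.new("http://www.example.com/articles", "a.story-link")
#   scraper.paginated = true
#   scraper.pagination_param = "page"   # the default
#   scraper.pagination_max_pages = 5    # scrape ?page=1 through ?page=5
#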
##
def next_index_page_url(url, pagination_index)
return url unless @paginated
if pagination_index > @pagination_max_pages
puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
EMPTY_STRING
else
uri = URI.parse(url)
query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
# update the pagination query string parameter
query[@pagination_param] = pagination_index
uri.query = URI.encode_www_form(query)
puts "Next index pagination url is #{uri}" if @verbose
uri.to_s
end
end
##
# Writes the scraped result to a CSV at the given filename.
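#
# For example (the filename, selector, and block below are hypothetical,
# not part of Upton):
#
#   scraper.scrape_to_csv("output.csv") do |html, url, index|
#     doc = Nokogiri::HTML(html)
#     [url, doc.css("h1").text]
#   end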
##
def scrape_to_csv filename, &blk
require 'csv'
self.url_array = self.get_index unless self.url_array
CSV.open filename, 'wb' do |csv|
#this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
self.scrape_from_list(self.url_array, blk).compact.each do |document|
if document[0].respond_to? :map
document.each{|row| csv << row }
else
csv << document
end
end
#self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
end
end
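##
# Like +scrape_to_csv+, but writes the scraped result to a tab-separated
# file at the given filename.
##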
def scrape_to_tsv filename, &blk
require 'csv'
self.url_array = self.get_index unless self.url_array
CSV.open filename, 'wb', :col_sep => "\t" do |csv|
#this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
self.scrape_from_list(self.url_array, blk).compact.each do |document|
if document[0].respond_to? :map
document.each{|row| csv << row }
else
csv << document
end
end
#self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
end
end
protected
##
# Fetches pages via Downloader, which handles stashing.
##
def get_page(url, stash=false, options={})
return EMPTY_STRING if url.nil? || url.empty? #url is nil if the <a> lacks an `href` attribute.
global_options = {
:cache => stash,
:verbose => @verbose
}
if @readable_filenames
global_options[:readable_filenames] = true
end
if @stash_folder
global_options[:readable_filenames] = true
global_options[:cache_location] = @stash_folder
end
resp_and_cache = Downloader.new(url, global_options.merge(options)).get
if resp_and_cache[:from_resource]
puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
sleep @sleep_time_between_requests
end
resp_and_cache[:resp]
end
##
# Sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html".
# resolve_url resolves them to absolute URLs.
# absolute_url_str must be an absolute URL, given as either a String or a URI.
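#
# e.g. resolve_url("next.html", "http://www.example.com/articles/index.html")
# ought to return "http://www.example.com/articles/next.html" (example URLs
# are hypothetical).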
##
def resolve_url(href_str, absolute_url_str)
if absolute_url_str.class <= URI::Generic
absolute_url = absolute_url_str.dup
else
begin
absolute_url = URI(absolute_url_str).dup
rescue URI::InvalidURIError
raise ArgumentError, "#{absolute_url_str} must represent a valid relative or absolute URI"
end
end
raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
if href_str.class <= URI::Generic
href = href_str.dup
else
begin
href = URI(href_str).dup
rescue URI::InvalidURIError
raise ArgumentError, "#{href_str} must represent a valid relative or absolute URI"
end
end
# return href as-is if it is already absolute
return href.to_s if href.absolute?
#TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
URI.join(absolute_url.to_s, href.to_s).to_s
end
##
# Return a list of URLs for the instances you want to scrape.
# This can optionally be overridden if, for example, the list of instances
# comes from an API.
##
def get_index
get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
end
# TODO: Not sure the best way to handle this
# Currently, #parse_index is called upon #get_index_pages,
# which itself is dependent on @index_url
# Does @index_url stay unaltered for the lifetime of the Upton instance?
# It seems to at this point, but that may be something that gets
# deprecated later
#
# So for now, @index_url is used in conjunction with resolve_url
# to make sure that this method returns absolute urls
# i.e. this method expects @index_url to always have an absolute address
# for the lifetime of an Upton instance
def parse_index(text, selector)
Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
href = a_element["href"]
resolved_url = resolve_url( href, @index_url) unless href.nil?
puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
resolved_url
end
end
##
# Returns the contents of each page of a paginated index, collected into an
# Array, e.g. for a site listing links across 2+ pages.
##
def get_index_pages(original_url, pagination_index, pagination_interval, options={})
resps = []
prev_url = nil
while resps.empty? || !resps.last.empty?
next_url = self.next_index_page_url(original_url, pagination_index)
break if next_url.empty?
next_url = resolve_url(next_url, original_url)
break if next_url == prev_url
next_resp = self.get_page(next_url, @index_debug, options).to_s
prev_url = next_url
pagination_index += pagination_interval
resps << next_resp
end
resps
end
##
# Returns the instance at `url`.
#
# If the page is stashed, returns that; otherwise, fetches it from the web.
#
# If an instance is paginated, returns an Array containing the output of
# each page, e.g. if a news article has two pages.
##
def get_instance(url, pagination_index=0, options={})
resps = [self.get_page(url, @debug, options)]
pagination_index = pagination_index.to_i
prev_url = url
while !resps.last.empty?
next_url = self.next_instance_page_url(url, pagination_index + 1)
break if next_url == prev_url || next_url.empty?
next_resp = self.get_page(next_url, @debug, options)
prev_url = next_url
resps << next_resp
end
resps
end
# Just a helper for +scrape+.
def scrape_from_list(list, blk)
puts "Scraping #{list.size} instances" if @verbose
list.each_with_index.map do |instance_url, instance_index|
instance_resps = get_instance instance_url, nil, :instance_index => instance_index
instance_resps.each_with_index.map do |instance_resp, pagination_index|
blk.call(instance_resp, instance_url, instance_index, pagination_index)
end
end.flatten(1)
end
# it's often useful to have this slug method for (almost certainly) uniquely identifying pages.
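# e.g. slug("http://www.example.com/articles/my-story.html?page=2") returns
# "my-story" (example URL hypothetical).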
def slug(url)
url.split("/")[-1].gsub(/\?.*/, "").gsub(/\.html.*/, "")
end
end
end