diff --git a/lib/arachnid2.rb b/lib/arachnid2.rb index 1342767..cab5802 100644 --- a/lib/arachnid2.rb +++ b/lib/arachnid2.rb @@ -105,17 +105,16 @@ def initialize(url) # # @return nil # - def crawl(opts = {}, with_watir = false) + def crawl(opts = {}, with_watir = false, &block) if with_watir - crawl_watir(opts, &Proc.new) + crawl_watir(opts, &block) else - Arachnid2::Typhoeus.new(@url).crawl(opts, &Proc.new) + Arachnid2::Typhoeus.new(@url).crawl(opts, &block) end end - def crawl_watir(opts) - Arachnid2::Watir.new(@url).crawl(opts, &Proc.new) + def crawl_watir(opts, &block) + Arachnid2::Watir.new(@url).crawl(opts, &block) end - # https://mudge.name/2011/01/26/passing-blocks-in-ruby-without-block.html end diff --git a/lib/arachnid2/typhoeus.rb b/lib/arachnid2/typhoeus.rb index c04564e..e17fa86 100644 --- a/lib/arachnid2/typhoeus.rb +++ b/lib/arachnid2/typhoeus.rb @@ -9,7 +9,7 @@ def initialize(url) @cached_data = [] end - def crawl(opts = {}) + def crawl(opts = {}, &block) preflight(opts) typhoeus_preflight @@ -20,11 +20,11 @@ def crawl(opts = {}) break if time_to_stop? @global_visited.insert(q) - found_in_cache = use_cache(q, opts, &Proc.new) + found_in_cache = use_cache(q, opts, &block) return if found_in_cache request = ::Typhoeus::Request.new(q, request_options) - requestable = after_request(request, &Proc.new) + requestable = after_request(request, &block) @hydra.queue(request) if requestable end # max_concurrency.times do @@ -35,9 +35,9 @@ def crawl(opts = {}) end # def crawl(opts = {}) private - def after_request(request) + def after_request(request, &block) request.on_complete do |response| - cacheable = use_response(response, &Proc.new) + cacheable = use_response(response, &block) return unless cacheable put_cached_data(response.effective_url, @options, response) @@ -46,19 +46,19 @@ def after_request(request) true end - def use_response(response) + def use_response(response, &block) links = process(response.effective_url, response.body) return unless links - yield response + block.call response vacuum(links, response.effective_url) true end - def use_cache(url, options) + def use_cache(url, options, &block) data = load_data(url, options) - use_response(data, &Proc.new) if data + use_response(data, &block) if data data end diff --git a/lib/arachnid2/watir.rb b/lib/arachnid2/watir.rb index dfe086c..b9299b7 100644 --- a/lib/arachnid2/watir.rb +++ b/lib/arachnid2/watir.rb @@ -10,7 +10,7 @@ def initialize(url) @domain = Adomain[@url] end - def crawl(opts) + def crawl(opts, &block) preflight(opts) watir_preflight @already_retried = false @@ -23,7 +23,7 @@ def crawl(opts) @global_visited.insert(q) - make_request(q, &Proc.new) + make_request(q, &block) end # until @global_queue.empty? ensure @browser.close if @browser rescue nil @@ -31,9 +31,9 @@ def crawl(opts) end private - def make_request(q) + def make_request(q, &block) begin - links = browse_links(q, &Proc.new) + links = browse_links(q, &block) return unless links vacuum(links, browser.url) @@ -53,10 +53,10 @@ def make_request(q) end end - def browse_links(url) + def browse_links(url, &block) return unless navigate(url) - yield browser + block.call browser process(browser.url, browser.body.html) if browser.body.exists? end