Skip to content

Commit

Permalink
Add crawling
Browse files Browse the repository at this point in the history
Slow as a baby, steady as a soldier. We'll find those links in need!
  • Loading branch information
MarkyMarkMcDonald committed Apr 5, 2018
1 parent 8579a28 commit feb1df2
Show file tree
Hide file tree
Showing 12 changed files with 216 additions and 12 deletions.
3 changes: 1 addition & 2 deletions bin/doc_doc
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ require 'doc_doc'

# Build the run configuration from the first command-line argument and the
# default throttle.
# NOTE(review): this listing appears to show removed and added diff lines
# together without +/- markers; the comma-terminated DEFAULT_THROTTLE line and
# the DEFAULT_OPTIONS line look like the removed pre-change arguments — confirm
# against the repository before editing.
options = DocDoc::Configuration::Options.new(
ARGV[0],
DocDoc::HorseAndBuggy::DEFAULT_THROTTLE,
DocDoc::Configuration::Crawling::DEFAULT_OPTIONS
DocDoc::HorseAndBuggy::DEFAULT_THROTTLE
)

OptionParser.new do |parser|
Expand Down
19 changes: 19 additions & 0 deletions example_crawling_start/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<!-- Crawl entry fixture: links to a page one hop away, to the external fixture
     site at http://localhost:7654/, and to a page that does not exist. -->
<html>
<body>
<ul>
<li>
<a href="one-link-away.html">One link away</a>
</li>
<li>
<a href="http://localhost:7654/">External site</a>
</li>
</ul>

<h2>Sad Paths:</h2>
<ul>
<li>
<a href="some-page-that-does-not-exist.html">Page that does not exist anymore</a>
</li>
</ul>
</body>
</html>
13 changes: 13 additions & 0 deletions example_crawling_start/one-link-away.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!-- Fixture one hop from the index: one link to a missing page and one link
     onward to two-links-away.html. -->
<html>
<body>
<h2>I exist!</h2>
<ul>
<li>
<a href="some-page-that-does-not-exist.html">Page that does not exist anymore</a>
</li>
<li>
<a href="two-links-away.html">Two links away</a>
</li>
</ul>
</body>
</html>
13 changes: 13 additions & 0 deletions example_crawling_start/three-links-away.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!-- Fixture three hops from the index: one link to a missing page and one link
     onward to four-links-away.html.
     NOTE(review): four-links-away.html is not among the fixtures visible in
     this change; confirm whether it is intentionally absent. -->
<html>
<body>
<h2>I exist!</h2>
<ul>
<li>
<a href="some-page-that-does-not-exist.html">Page that does not exist anymore</a>
</li>
<li>
<a href="four-links-away.html">Four links away</a>
</li>
</ul>
</body>
</html>
13 changes: 13 additions & 0 deletions example_crawling_start/two-links-away.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!-- Fixture two hops from the index: one link to a missing page and one link
     onward to three-links-away.html. -->
<html>
<body>
<h2>I exist!</h2>
<ul>
<li>
<a href="some-page-that-does-not-exist.html">Page that does not exist anymore</a>
</li>
<li>
<a href="three-links-away.html">Three links away</a>
</li>
</ul>
</body>
</html>
14 changes: 13 additions & 1 deletion example_external_site/index.html
Original file line number Diff line number Diff line change
@@ -1 +1,13 @@
<h2>This is an externally hosted guide that may be tangentially related to your documentation</h2>
<!-- Entry page of the external fixture site: links to one page that exists
     (one-link-away.html) and one page that does not. -->
<html>
<body>
<h2>This is an externally hosted guide that may be related to your documentation, but not maintained by you</h2>
<ul>
<li>
<a href="one-link-away.html">Page that exists</a>
</li>
<li>
<a href="some-page-that-does-not-exist.html">Page that does not exist</a>
</li>
</ul>
</body>
</html>
10 changes: 10 additions & 0 deletions example_external_site/one-link-away.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!-- External fixture page one hop from the external index: its only link
     points to a page that does not exist. -->
<html>
<body>
<h2>I exist!</h2>
<ul>
<li>
<a href="some-page-that-does-not-exist.html">Page that does not exist anymore</a>
</li>
</ul>
</body>
</html>
49 changes: 42 additions & 7 deletions lib/doc_doc.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,50 @@
module DocDoc
# Builds the Prescription for the site rooted at config.danger_zone: collects
# the patients Quarantine reports for the starting location, treats each one,
# then repeats collection and treatment for up to
# config.crawling_options.max_spiderings further rounds. The resulting
# treatments are sorted by starting_location before being wrapped.
#
# NOTE(review): this text appears to come from a diff view that shows removed
# and added lines together without +/- markers. The first `patients =`
# assignment, the `treatments = patients.map do` block that is never closed,
# and the early `Prescription.new(treatments)` look like the removed
# pre-change lines — confirm against the repository before editing.
def self.prescription(config)
horse_and_buggy = HorseAndBuggy.new(config.throttle)
patients = Quarantine.new(horse_and_buggy, config.danger_zone).patients
patients = []
treatments = []

treatments = patients.map do |patient|
visit = HouseCall.new(horse_and_buggy, patient, config.danger_zone)
visit.start
illness = visit.illness
Treatment.new(patient, illness, visit) if illness
# Wrap the configured entry point so its location can be read back via #home.
patient_zero = Patient.new(config.danger_zone)

patients += Quarantine.new(horse_and_buggy, patient_zero.home).patients

# treat returns nil when no illness is found, hence the compact.
treatments += patients.map do |patient|
treat(patient_zero.home, horse_and_buggy, patient)
end.compact

Prescription.new(treatments)
# Patients that already have a treatment; they are excluded from the next
# round of spidering below.
ill_patients = patients.select do |patient|
treatments.map(&:patient).include?(patient)
end

# Each round starts from the patients found in the previous round.
sub_patients = patients
(1..config.crawling_options.max_spiderings).each do
# foo: [patient, patients Quarantine reports for patient.home] pairs, one per
# not-ill patient of the previous round.
foo = (sub_patients - ill_patients).map do |patient|
[patient, Quarantine.new(horse_and_buggy, patient.home).patients]
end

# Treat every newly found patient relative to the page it was found from.
sub_treatments = foo.flat_map do |sub_patient_zero, sub_sub_patients|
sub_sub_patients.map do |sub_patient|
treat(sub_patient_zero.home, horse_and_buggy, sub_patient)
end.compact
end

treatments += sub_treatments
# The newly found patients become the input of the next round.
# NOTE(review): nothing visible here filters out patients seen in earlier
# rounds, so pages that link to each other may be revisited — confirm whether
# Patient/Quarantine deduplicate.
sub_patients = foo.flat_map do |f| f[1] end
patients += sub_patients
ill_patients = patients.select do |patient|
treatments.map(&:patient).include?(patient)
end
end

Prescription.new(treatments.sort_by(&:starting_location))
end

private

# Performs a HouseCall for +patient+ starting from +starting_place+ and wraps
# the outcome in a Treatment.
#
# Returns a Treatment when the house call reports an illness, otherwise nil.
#
# NOTE(review): the bare `private` preceding this definition does not apply to
# methods defined with `def self.`; use `private_class_method :treat` if this
# helper is meant to be hidden from callers.
def self.treat(starting_place, horse_and_buggy, patient)
  house_call = HouseCall.new(horse_and_buggy, patient, starting_place)
  house_call.start
  diagnosis = house_call.illness
  return unless diagnosis

  Treatment.new(patient, diagnosis, house_call)
end
end
10 changes: 9 additions & 1 deletion lib/doc_doc/configuration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

module DocDoc
module Configuration
Options = Struct.new(:danger_zone, :throttle, :crawling_options)
# Run settings for a DocDoc invocation, exposed through read-only accessors.
#
# danger_zone      - the starting location handed to the crawl.
# throttle         - value passed on to HorseAndBuggy.
# crawling_options - crawl settings; a fresh Crawling::Options when omitted.
class Options
  attr_reader :danger_zone
  attr_reader :throttle
  attr_reader :crawling_options

  def initialize(danger_zone, throttle, crawling_options = Crawling::Options.new)
    @danger_zone, @throttle, @crawling_options = danger_zone, throttle, crawling_options
  end
end
end
end
9 changes: 8 additions & 1 deletion lib/doc_doc/configuration/crawling.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
module DocDoc
module Configuration
module Crawling
Options = Struct.new(:boundary, :max_spiderings)
# Crawl settings, exposed through read-only accessors.
#
# boundary       - defaults to nil.
# max_spiderings - defaults to 0; DocDoc.prescription iterates
#                  (1..max_spiderings), so 0 means no additional rounds.
class Options
  attr_reader :boundary
  attr_reader :max_spiderings

  def initialize(boundary = nil, max_spiderings = 0)
    @boundary, @max_spiderings = boundary, max_spiderings
  end
end
end
end
end
6 changes: 6 additions & 0 deletions lib/doc_doc/treatment.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
module DocDoc
class Treatment
attr_reader :patient

# Records the patient, the illness found, and the house call that found it.
def initialize(patient, illness, house_call)
  @patient, @illness, @house_call = patient, illness, house_call
end

# Location the underlying house call started from; delegated to the house call.
# DocDoc.prescription sorts treatments by this value.
def starting_location
@house_call.starting_location
end

def as_json
{
page: @house_call.starting_location,
Expand Down
69 changes: 69 additions & 0 deletions test/crawling_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
require "test_helper"
require "static_server"

# End-to-end check of crawling: serves the two fixture sites locally, runs
# DocDoc.prescription with two extra spidering rounds, and compares the JSON
# report with the broken links expected from the fixtures.
class CrawlingTest < Minitest::Test
  # The crawling fixture's index.html links to http://localhost:7654/ verbatim,
  # so the external fixture server has to listen on exactly this port.
  EXTERNAL_SITE_PORT = 7654

  def test_it_does_something_useful
    # Port 0 is passed for the primary site; the port actually in use is read
    # back from the server config below.
    @server = StaticServer.start('example_crawling_start', 0)
    @external_server = StaticServer.start('example_external_site', EXTERNAL_SITE_PORT)

    quarantine_entrance = "http://localhost:#{@server.config[:Port]}"
    external_site_entrance = "http://localhost:#{EXTERNAL_SITE_PORT}"

    config = DocDoc::Configuration::Options.new(
      quarantine_entrance,
      nil,
      DocDoc::Configuration::Crawling::Options.new(nil, 2)
    )
    prescription = DocDoc.prescription(config)

    assert_equal(
      expected_prescription(quarantine_entrance, external_site_entrance),
      JSON.parse(prescription.to_s)
    )
  ensure
    # Either server may be nil if startup raised; skipping nils keeps the
    # original failure from being replaced by a NoMethodError here.
    [@server, @external_server].compact.each(&:shutdown)
  end

  private

  # Expected report: a 404 entry for the missing page linked from the entry
  # page, from the pages one and two links away, and from both pages of the
  # external site.
  def expected_prescription(quarantine_entrance, external_site)
    missing_local = "#{quarantine_entrance}/some-page-that-does-not-exist.html"
    missing_external = "#{external_site}/some-page-that-does-not-exist.html"

    {
      "links" => [
        not_found(quarantine_entrance, missing_local),
        not_found("#{quarantine_entrance}/one-link-away.html", missing_local),
        not_found("#{quarantine_entrance}/two-links-away.html", missing_local),
        not_found("#{external_site}/", missing_external),
        not_found("#{external_site}/one-link-away.html", missing_external)
      ]
    }
  end

  # One report entry for +href+ found on +page+ that answered with HTTP 404.
  def not_found(page, href)
    {
      "page" => page,
      "href" => href,
      "error" => {
        "type" => "http",
        "description" => 404
      }
    }
  end
end

0 comments on commit feb1df2

Please sign in to comment.