Commit
Add the crawler mixin and a sample form extractor crawler
git-svn-id: file:///home/svn/framework3/trunk@11025 4d416f70-5f16-0410-b530-b9f4589650da
HD Moore committed Nov 13, 2010
1 parent bc2d43d · commit f457ccb
Showing 4 changed files with 525 additions and 3 deletions.
@@ -1,6 +1,5 @@
require 'rubygems'
# Load the Anemone core
require 'anemone/core'

# Overload the HTTP class
# Overload the HTTP class with a variant that uses Rex::Proto::HTTP
require 'anemone/rex_http'

@@ -0,0 +1,300 @@
module Msf

###
#
# This module provides methods for implementing a web crawler
#
###
module Exploit::Remote::HttpCrawler
	include Msf::Auxiliary::Report

	def initialize(info = {})
		super

		register_options(
			[
				Opt::RHOST,
				Opt::RPORT(80),
				OptString.new('VHOST', [ false, "HTTP server virtual host" ]),
				OptString.new('URI', [ true, "The starting page to crawl", "/"]),
				Opt::Proxies,
				OptInt.new('MAX_PAGES', [ true, 'The maximum number of pages to crawl per URL', 500]),
				OptInt.new('MAX_MINUTES', [ true, 'The maximum number of minutes to spend on each URL', 5]),
				OptInt.new('MAX_THREADS', [ true, 'The maximum number of concurrent requests', 4])
			], self.class
		)

		register_advanced_options(
			[
				OptInt.new('RequestTimeout', [false, 'The maximum number of seconds to wait for a reply', 15]),
				OptInt.new('RedirectLimit', [false, 'The maximum number of redirects for a single request', 5]),
				OptInt.new('RetryLimit', [false, 'The maximum number of attempts for a single request', 5]),
				OptString.new('UserAgent', [true, 'The User-Agent header to use for all requests',
					"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
				]),
				OptString.new('BasicAuthUser', [false, 'The HTTP username to specify for basic authentication']),
				OptString.new('BasicAuthPass', [false, 'The HTTP password to specify for basic authentication']),
				OptBool.new('SSL', [ false, 'Negotiate SSL for outgoing connections', false]),
				OptEnum.new('SSLVersion', [ false, 'Specify the version of SSL that should be used', 'SSL3', ['SSL2', 'SSL23', 'SSL3', 'TLS1']]),
			], self.class
		)

		register_autofilter_ports([ 80, 8080, 443, 8000, 8888, 8880, 8008, 3000, 8443 ])
		register_autofilter_services(%W{ http https })

		begin
			require 'anemone'
			@anemone_loaded = true
		rescue ::Exception => e
			@anemone_loaded = false
			@anemone_error = e
		end
	end

	def setup
		raise RuntimeError, "Could not load Anemone/Nokogiri: #{@anemone_error}" if not @anemone_loaded
		super
	end

	def cleanup
		if @crawler
			@crawler.shutdown rescue nil
			@crawler = nil
		end
		super
	end

	##
	#
	# Crawler methods and accessors
	#
	##

	# A target object for tracking URLs
	class WebTarget < ::Hash
		def to_url
			proto = self[:ssl] ? "https" : "http"
			"#{proto}://#{self[:host]}:#{self[:port]}#{self[:path]}"
		end
	end
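
	# For illustration only (not part of this commit): a WebTarget populated the
	# way run_host does below renders as an absolute URL, e.g.
	#
	#   t = WebTarget.new
	#   t.merge!(:ssl => true, :host => '192.0.2.10', :port => 8443, :path => '/app/')
	#   t.to_url  # => "https://192.0.2.10:8443/app/"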

	# A custom error to signify we hit the page request cap
	class MaximumPageCount < ::RuntimeError
	end

	# Some accessors for stat tracking
	attr_accessor :targets
	attr_accessor :url_count, :url_total, :form_count, :request_count


	# Entry point for the crawler code
	def run_host(ip)

		print_status("Testing #{ip}...")

		self.request_count = 0
		self.form_count = 0
		self.url_count = 0
		self.url_total = 1

		path,query = datastore['URI'].split('?', 2)
		query ||= ""

		t = WebTarget.new

		t.merge!({
			:vhost => vhost,
			:host => rhost,
			:port => rport,
			:ssl => ssl,
			:path => path,
			:query => query,
			:user => datastore['BasicAuthUser'],
			:pass => datastore['BasicAuthPass'],
			:info => ""
		})

		t[:site] = report_web_site(:wait => true, :host => t[:host], :port => t[:port], :vhost => t[:vhost], :ssl => t[:ssl])

		print_status("Crawling #{t.to_url}...")
		begin
			@current_vhost = t[:vhost]
			@current_site = t[:site]
			::Timeout.timeout(max_crawl_time) { crawl_target(t) }
		rescue ::Timeout::Error
			print_error("Crawl of #{t.to_url} has reached the configured timeout")
		ensure
			@current_vhost = nil
		end
		print_status("Crawl of #{t.to_url} complete")
	end

	def get_connection_timeout
		datastore['RequestTimeout']
	end

	def max_page_count
		datastore['MAX_PAGES']
	end

	def max_crawl_time
		datastore['MAX_MINUTES'] * 60.0
	end

	def max_crawl_threads
		datastore['MAX_THREADS']
	end

	def get_link_filter
		/\.(js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?)$/i
	end

	def focus_crawl(page)
		page.links
	end
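
	# For illustration only (not part of this commit): modules that mix this in
	# can override the two hooks above, e.g. to also skip stylesheets and to
	# follow only links that point back at the target host:
	#
	#   def get_link_filter
	#     /\.(css|js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?)$/i
	#   end
	#
	#   def focus_crawl(page)
	#     page.links.select { |link| link.host == rhost }
	#   end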

	def crawl_target(t)
		cnt = 0
		opts = crawler_options(t)
		url = t.to_url

		@crawler = ::Anemone::Core.new([url], opts)
		@crawler.on_every_page do |page|
			cnt += 1

			self.request_count += 1

			# Extract any interesting data from the page
			crawler_process_page(t, page, cnt)

			# Sync the database every 100 items
			if cnt % 100 == 0
				framework.db.sync
			end

			# Blow up if we hit our maximum page count
			if cnt >= max_page_count
				print_error("Maximum page count reached for #{url}")
				raise MaximumPageCount, "Maximum page count reached"
			end
		end

		# Skip link processing based on a regular expression
		@crawler.skip_links_like(
			get_link_filter
		)

		# Focus our crawling on interesting, but not over-crawled links
		@crawler.focus_crawl do |page|
			focus_crawl(page)
		end

		begin
			@crawler.run
		rescue MaximumPageCount
			# No need to print anything else
		rescue ::Timeout::Error
			# Bubble this up to the top-level handler
			raise $!
		rescue ::Exception => e
			print_error("Crawler Exception: #{url} #{e} #{e.backtrace}")
		ensure
			@crawler.shutdown rescue nil
			@crawler = nil
		end
	end

	def crawler_process_page(t, page, cnt)
		msg = "[#{"%.5d" % cnt}/#{"%.5d" % max_page_count}] #{page.code || "ERR"} - #{@current_site.vhost} - #{page.url}"
		case page.code
		when 301,302
			if page.headers and page.headers["location"]
				print_status(msg + " -> " + page.headers["location"].to_s)
			else
				print_status(msg)
			end
		when 500...599
			# XXX: Log the fact that we hit an error page
			print_good(msg)
		when 401,403
			print_good(msg)
		when 200
			print_status(msg)
		when 404
			print_error(msg)
		else
			print_error(msg)
		end
	end
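
	# For illustration only (not part of this commit): with the default
	# MAX_PAGES of 500, the status lines produced by this method take the form
	# below (host, vhost and paths are made-up example values):
	#
	#   [00001/00500] 200 - 192.0.2.10 - http://192.0.2.10:8080/
	#   [00002/00500] 302 - 192.0.2.10 - http://192.0.2.10:8080/login -> /account/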

	def crawler_options(t)
		opts = {}
		opts[:user_agent] = datastore['UserAgent']
		opts[:verbose] = false
		opts[:threads] = max_crawl_threads
		opts[:obey_robots_txt] = false
		opts[:redirect_limit] = datastore['RedirectLimit']
		opts[:retry_limit] = datastore['RetryLimit']
		opts[:accept_cookies] = true
		opts[:depth_limit] = false
		opts[:skip_query_strings] = false
		opts[:discard_page_bodies] = true
		opts[:framework] = framework
		opts[:module] = self
		opts[:timeout] = get_connection_timeout
		opts
	end
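
	# Note (not part of this commit): :framework and :module are not options
	# that stock Anemone understands; presumably they are consumed by the
	# Rex-based HTTP replacement pulled in via anemone/rex_http, which is not
	# shown in the diff above.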


	##
	#
	# Wrappers for getters
	#
	##

	#
	# Returns the target host
	#
	def rhost
		datastore['RHOST']
	end

	#
	# Returns the remote port
	#
	def rport
		datastore['RPORT']
	end

	#
	# Returns the VHOST of the HTTP server.
	#
	def vhost
		datastore['VHOST'] || datastore['RHOST']
	end

	#
	# Returns the boolean indicating SSL
	#
	def ssl
		((datastore.default?('SSL') and rport.to_i == 443) or datastore['SSL'])
	end

	#
	# Returns the string indicating SSL version
	#
	def ssl_version
		datastore['SSLVersion']
	end

	#
	# Returns the configured proxy list
	#
	def proxies
		datastore['Proxies']
	end


end

end
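
The remaining two files in this commit, including the sample form extractor crawler named in the commit message, are not rendered above. For orientation only, here is a minimal sketch of an auxiliary module that consumes the mixin; the module name, description, and the form-reporting logic are illustrative assumptions rather than the committed code:

require 'msf/core'

class Metasploit3 < Msf::Auxiliary

	include Msf::Exploit::Remote::HttpCrawler

	def initialize
		super(
			'Name'        => 'Example HTTP Crawler (illustrative)',
			'Description' => 'Crawl a target web site and report any forms found',
			'Author'      => [ 'example' ],
			'License'     => MSF_LICENSE
		)
	end

	# Simple driver: crawl the single host configured via RHOST
	def run
		run_host(rhost)
	end

	# Hook invoked by the mixin for every crawled page; Anemone exposes parsed
	# HTML as a Nokogiri document via page.doc
	def crawler_process_page(t, page, cnt)
		super
		return if not page.doc # non-HTML responses have no parsed document
		page.doc.css('form').each do |form|
			print_good("#{page.url} form: #{form['method'] || 'GET'} #{form['action']}")
		end
	end
end

Loaded into msfconsole, a module like this is driven with the usual set RHOST/URI/MAX_PAGES options followed by run.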