Commit
Add the crawler mixin and a sample form extractor crawler
git-svn-id: file:///home/svn/framework3/trunk@11025 4d416f70-5f16-0410-b530-b9f4589650da
HD Moore committed Nov 13, 2010
1 parent bc2d43d · commit f457ccb
Showing 4 changed files with 525 additions and 3 deletions.
@@ -1,6 +1,5 @@
require 'rubygems'
# Load the Anemone core
require 'anemone/core'

# Overload the HTTP class
# Overload the HTTP class with a variant that uses Rex::Proto::HTTP
require 'anemone/rex_http'

@@ -0,0 +1,300 @@
module Msf

###
#
# This module provides methods for implementing a web crawler
#
###
module Exploit::Remote::HttpCrawler
	include Msf::Auxiliary::Report

	def initialize(info = {})
		super

		register_options(
			[
				Opt::RHOST,
				Opt::RPORT(80),
				OptString.new('VHOST', [ false, "HTTP server virtual host" ]),
				OptString.new('URI', [ true, "The starting page to crawl", "/"]),
				Opt::Proxies,
				OptInt.new('MAX_PAGES', [ true, 'The maximum number of pages to crawl per URL', 500]),
				OptInt.new('MAX_MINUTES', [ true, 'The maximum number of minutes to spend on each URL', 5]),
				OptInt.new('MAX_THREADS', [ true, 'The maximum number of concurrent requests', 4])
			], self.class
		)

		register_advanced_options(
			[
				OptInt.new('RequestTimeout', [false, 'The maximum number of seconds to wait for a reply', 15]),
				OptInt.new('RedirectLimit', [false, 'The maximum number of redirects for a single request', 5]),
				OptInt.new('RetryLimit', [false, 'The maximum number of attempts for a single request', 5]),
				OptString.new('UserAgent', [true, 'The User-Agent header to use for all requests',
					"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
				]),
				OptString.new('BasicAuthUser', [false, 'The HTTP username to specify for basic authentication']),
				OptString.new('BasicAuthPass', [false, 'The HTTP password to specify for basic authentication']),
				OptBool.new('SSL', [ false, 'Negotiate SSL for outgoing connections', false]),
				OptEnum.new('SSLVersion', [ false, 'Specify the version of SSL that should be used', 'SSL3', ['SSL2', 'SSL23', 'SSL3', 'TLS1']]),
			], self.class
		)

		register_autofilter_ports([ 80, 8080, 443, 8000, 8888, 8880, 8008, 3000, 8443 ])
		register_autofilter_services(%W{ http https })

		begin
			require 'anemone'
			@anemone_loaded = true
		rescue ::Exception => e
			@anemone_loaded = false
			@anemone_error = e
		end
	end

	def setup
		raise RuntimeError, "Could not load Anemone/Nokogiri: #{@anemone_error}" if not @anemone_loaded
		super
	end

	def cleanup
		if @crawler
			@crawler.shutdown rescue nil
			@crawler = nil
		end
		super
	end

	##
	#
	# Crawler methods and accessors
	#
	##

	# A target object for tracking URLs
	class WebTarget < ::Hash
		def to_url
			proto = self[:ssl] ? "https" : "http"
			"#{proto}://#{self[:host]}:#{self[:port]}#{self[:path]}"
		end
	end
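
	# For illustration only (not part of this commit): a WebTarget populated the
	# way run_host does below renders as an absolute URL, e.g.
	#
	#   t = WebTarget.new
	#   t.merge!(:ssl => true, :host => '192.0.2.10', :port => 8443, :path => '/app/')
	#   t.to_url  # => "https://192.0.2.10:8443/app/"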

	# A custom error to signify we hit the page request cap
	class MaximumPageCount < ::RuntimeError
	end

	# Some accessors for stat tracking
	attr_accessor :targets
	attr_accessor :url_count, :url_total, :form_count, :request_count


	# Entry point for the crawler code
	def run_host(ip)

		print_status("Testing #{ip}...")

		self.request_count = 0
		self.form_count = 0
		self.url_count = 0
		self.url_total = 1

		path,query = datastore['URI'].split('?', 2)
		query ||= ""

		t = WebTarget.new

		t.merge!({
			:vhost => vhost,
			:host => rhost,
			:port => rport,
			:ssl => ssl,
			:path => path,
			:query => query,
			:user => datastore['BasicAuthUser'],
			:pass => datastore['BasicAuthPass'],
			:info => ""
		})

		t[:site] = report_web_site(:wait => true, :host => t[:host], :port => t[:port], :vhost => t[:vhost], :ssl => t[:ssl])

		print_status("Crawling #{t.to_url}...")
		begin
			@current_vhost = t[:vhost]
			@current_site = t[:site]
			::Timeout.timeout(max_crawl_time) { crawl_target(t) }
		rescue ::Timeout::Error
			print_error("Crawl of #{t.to_url} has reached the configured timeout")
		ensure
			@current_vhost = nil
		end
		print_status("Crawl of #{t.to_url} complete")
	end

	def get_connection_timeout
		datastore['RequestTimeout']
	end

	def max_page_count
		datastore['MAX_PAGES']
	end

	def max_crawl_time
		datastore['MAX_MINUTES'] * 60.0
	end

	def max_crawl_threads
		datastore['MAX_THREADS']
	end

	def get_link_filter
		/\.(js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?)$/i
	end

	def focus_crawl(page)
		page.links
	end
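
	# For illustration only (not part of this commit): modules that mix this in
	# can override the two hooks above, e.g. to also skip stylesheets and to
	# follow only links that point back at the target host:
	#
	#   def get_link_filter
	#     /\.(css|js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?)$/i
	#   end
	#
	#   def focus_crawl(page)
	#     page.links.select { |link| link.host == rhost }
	#   end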

	def crawl_target(t)
		cnt = 0
		opts = crawler_options(t)
		url = t.to_url

		@crawler = ::Anemone::Core.new([url], opts)
		@crawler.on_every_page do |page|
			cnt += 1

			self.request_count += 1

			# Extract any interesting data from the page
			crawler_process_page(t, page, cnt)

			# Sync the database every 100 items
			if cnt % 100 == 0
				framework.db.sync
			end

			# Blow up if we hit our maximum page count
			if cnt >= max_page_count
				print_error("Maximum page count reached for #{url}")
				raise MaximumPageCount, "Maximum page count reached"
			end
		end

		# Skip link processing based on a regular expression
		@crawler.skip_links_like(
			get_link_filter
		)

		# Focus our crawling on interesting, but not over-crawled links
		@crawler.focus_crawl do |page|
			focus_crawl(page)
		end

		begin
			@crawler.run
		rescue MaximumPageCount
			# No need to print anything else
		rescue ::Timeout::Error
			# Bubble this up to the top-level handler
			raise $!
		rescue ::Exception => e
			print_error("Crawler Exception: #{url} #{e} #{e.backtrace}")
		ensure
			@crawler.shutdown rescue nil
			@crawler = nil
		end
	end

	def crawler_process_page(t, page, cnt)
		msg = "[#{"%.5d" % cnt}/#{"%.5d" % max_page_count}] #{page.code || "ERR"} - #{@current_site.vhost} - #{page.url}"
		case page.code
		when 301,302
			if page.headers and page.headers["location"]
				print_status(msg + " -> " + page.headers["location"].to_s)
			else
				print_status(msg)
			end
		when 500...599
			# XXX: Log the fact that we hit an error page
			print_good(msg)
		when 401,403
			print_good(msg)
		when 200
			print_status(msg)
		when 404
			print_error(msg)
		else
			print_error(msg)
		end
	end
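
	# For illustration only (not part of this commit): with the default
	# MAX_PAGES of 500, the status lines produced by this method take the form
	# below (host, vhost and paths are made-up example values):
	#
	#   [00001/00500] 200 - 192.0.2.10 - http://192.0.2.10:8080/
	#   [00002/00500] 302 - 192.0.2.10 - http://192.0.2.10:8080/login -> /account/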

	def crawler_options(t)
		opts = {}
		opts[:user_agent] = datastore['UserAgent']
		opts[:verbose] = false
		opts[:threads] = max_crawl_threads
		opts[:obey_robots_txt] = false
		opts[:redirect_limit] = datastore['RedirectLimit']
		opts[:retry_limit] = datastore['RetryLimit']
		opts[:accept_cookies] = true
		opts[:depth_limit] = false
		opts[:skip_query_strings] = false
		opts[:discard_page_bodies] = true
		opts[:framework] = framework
		opts[:module] = self
		opts[:timeout] = get_connection_timeout
		opts
	end
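
	# Note (not part of this commit): :framework and :module are not options
	# that stock Anemone understands; presumably they are consumed by the
	# Rex-based HTTP replacement pulled in via anemone/rex_http, which is not
	# shown in the diff above.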


	##
	#
	# Wrappers for getters
	#
	##

	#
	# Returns the target host
	#
	def rhost
		datastore['RHOST']
	end

	#
	# Returns the remote port
	#
	def rport
		datastore['RPORT']
	end

	#
	# Returns the VHOST of the HTTP server.
	#
	def vhost
		datastore['VHOST'] || datastore['RHOST']
	end

	#
	# Returns the boolean indicating SSL
	#
	def ssl
		((datastore.default?('SSL') and rport.to_i == 443) or datastore['SSL'])
	end

	#
	# Returns the string indicating SSL version
	#
	def ssl_version
		datastore['SSLVersion']
	end

	#
	# Returns the configured proxy list
	#
	def proxies
		datastore['Proxies']
	end


end

end
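
The remaining two files in this commit, including the sample form extractor crawler named in the commit message, are not rendered above. For orientation only, here is a minimal sketch of an auxiliary module that consumes the mixin; the module name, description, and the form-reporting logic are illustrative assumptions rather than the committed code:

require 'msf/core'

class Metasploit3 < Msf::Auxiliary

	include Msf::Exploit::Remote::HttpCrawler

	def initialize
		super(
			'Name'        => 'Example HTTP Crawler (illustrative)',
			'Description' => 'Crawl a target web site and report any forms found',
			'Author'      => [ 'example' ],
			'License'     => MSF_LICENSE
		)
	end

	# Simple driver: crawl the single host configured via RHOST
	def run
		run_host(rhost)
	end

	# Hook invoked by the mixin for every crawled page; Anemone exposes parsed
	# HTML as a Nokogiri document via page.doc
	def crawler_process_page(t, page, cnt)
		super
		return if not page.doc # non-HTML responses have no parsed document
		page.doc.css('form').each do |form|
			print_good("#{page.url} form: #{form['method'] || 'GET'} #{form['action']}")
		end
	end
end

Loaded into msfconsole, a module like this is driven with the usual set RHOST/URI/MAX_PAGES options followed by run.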