Merge pull request theanti9#8 from WilHall/master
Updates to PyCrawler
theanti9 committed Nov 27, 2011
2 parents ca6d7a8 + d9ed584 commit fafb8c1
Showing 7 changed files with 160 additions and 45 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,3 +1,4 @@
*.pyc
.DS_Store
*.db
*.db
*.log
68 changes: 68 additions & 0 deletions ColorStreamHandler.py
@@ -0,0 +1,68 @@
import logging
import curses

class ColorStreamHandler(logging.Handler):

    def __init__(self, use_colors):
        logging.Handler.__init__(self)
        self.use_colors = use_colors

        # Initialize environment
        curses.setupterm()

        # Get the foreground color attribute for this environment
        self.fcap = curses.tigetstr('setaf')

        # Get the normal attribute
        self.COLOR_NORMAL = curses.tigetstr('sgr0')

        # Get + Save the color sequences
        self.COLOR_INFO = curses.tparm(self.fcap, curses.COLOR_GREEN)
        self.COLOR_ERROR = curses.tparm(self.fcap, curses.COLOR_RED)
        self.COLOR_WARNING = curses.tparm(self.fcap, curses.COLOR_YELLOW)
        self.COLOR_DEBUG = curses.tparm(self.fcap, curses.COLOR_BLUE)

    def color(self, msg, level):
        if level == "INFO":
            return "%s%s%s" % (self.COLOR_INFO, msg, self.COLOR_NORMAL)
        elif level == "WARNING":
            return "%s%s%s" % (self.COLOR_WARNING, msg, self.COLOR_NORMAL)
        elif level == "ERROR":
            return "%s%s%s" % (self.COLOR_ERROR, msg, self.COLOR_NORMAL)
        elif level == "DEBUG":
            return "%s%s%s" % (self.COLOR_DEBUG, msg, self.COLOR_NORMAL)
        else:
            return msg

    def emit(self, record):
        record.msg = record.msg.encode('utf-8', 'ignore')
        msg = self.format(record)

        # This just removes the date and milliseconds from asctime
        temp = msg.split(']')
        msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]

        if self.use_colors:
            msg = self.color(msg, record.levelname)
        print msg

    # 'record' has the following attributes:
    # threadName
    # name
    # thread
    # created
    # process
    # processName
    # args
    # module
    # filename
    # levelno
    # exc_text
    # pathname
    # lineno
    # msg
    # exc_info
    # funcName
    # relativeCreated
    # levelname
    # msecs
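For orientation, a minimal sketch (not part of this commit) of how a custom handler like this gets picked up through dictConfig; the module path, formatter, and crawler_logger name mirror the settings.py diff further down, and dictConfig hands the extra use_colors key to the handler's constructor. Assumes Python 2.7 and a terminal, since the handler calls curses.setupterm():

# sketch only -- assumes ColorStreamHandler.py is on the import path
import logging
import logging.config

logging.config.dictConfig({
    'version': 1,
    'formatters': {
        'console': {'format': '[%(asctime)s] %(levelname)s::%(module)s - %(message)s'},
    },
    'handlers': {
        'console': {
            'class': 'ColorStreamHandler.ColorStreamHandler',  # resolved by dotted path
            'formatter': 'console',
            'level': 'DEBUG',
            'use_colors': True,  # user-defined keys become constructor kwargs
        },
    },
    'loggers': {
        'crawler_logger': {'handlers': ['console'], 'level': 'DEBUG'},
    },
})

logging.getLogger("crawler_logger").info(u"crawler started")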
40 changes: 23 additions & 17 deletions PyCrawler.py
@@ -1,7 +1,9 @@
from query import CrawlerDb
from content_processor import ContentProcessor
from settings import VERBOSE
import sys, urlparse, urllib2, robotparser
from settings import LOGGING
import sys, urlparse, urllib2, shutil, glob, robotparser
import logging, logging.config
import traceback

# ===== Init stuff =====

@@ -12,28 +14,32 @@
# content processor init
processor = ContentProcessor(None, None, None)

# logging setup
logging.config.dictConfig(LOGGING)
logger = logging.getLogger("crawler_logger")

# robot parser init
robot = robotparser.RobotFileParser()

if len(sys.argv) < 2:
    print "Error: No start url was passed"
    logger.info("Error: No start url was passed")
    sys.exit()

l = sys.argv[1:]

cdb.enqueue(l)

def crawl():
    print "starting..."
    logger.info("Starting (%s)..." % sys.argv[1])
    while True:
        url = cdb.dequeue()
        u = urlparse.urlparse(url)
        robot.set_url('http://'+u[1]+"/robots.txt")
        if not robot.can_fetch('PyCrawler', url):
            print "Url disallowed by robots.txt: %s " % url
        if not robot.can_fetch('PyCrawler', url.encode('ascii', 'replace')):
            logger.warning("Url disallowed by robots.txt: %s " % url)
            continue
        if not url.startswith('http'):
            print "Unfollowable link found at %s " % url
            logger.warning("Unfollowable link found at %s " % url)
            continue

        if cdb.checkCrawled(url):
@@ -48,8 +54,7 @@ def crawl():
        try:
            request = urllib2.urlopen(req)
        except urllib2.URLError, e:
            print e
            print "Exception at url: %s" % url
            logger.error("Exception at url: %s\n%s" % (url, e))
            continue
        except urllib2.HTTPError, e:
            status = e.code
@@ -65,25 +70,26 @@ def crawl():
            if not cdb.checkCrawled(q):
                add_queue.append(q)

        processor.setInfo(str(url), status, data)
        add_queue = processor.process()
        l = len(add_queue)
        if VERBOSE:
            print "Got %s status from %s" % (status, url)
            print "Found %i links" % l
        logger.info("Got %s status from %s (Found %i links)" % (status, url, l))
        if l > 0:
            cdb.enqueue(add_queue)
        cdb.addPage(processor.getDataDict())
        processor.reset()

    print "finishing..."
    logger.info("Finishing...")
    cdb.close()
    print "done! goodbye!"
    logger.info("Done! Goodbye!")

if __name__ == "__main__":
    try:
        crawl()
    except KeyboardInterrupt:
        print "Stopping"
        logger.error("Stopping (KeyboardInterrupt)")
        sys.exit()
    except Exception, e:
        print "EXCEPTION: %s " % e

        logger.error("EXCEPTION: %s " % e)
        traceback.print_exc()

11 changes: 9 additions & 2 deletions README.md
@@ -4,15 +4,22 @@ Setup
- DATABASE_ENGINE can either be "mysql" or "sqlite"
- For sqlite only DATABASE_HOST is used, and it should begin with a '/'
- All other DATABASE_* settings are required for mysql
- VERBOSE mode causes the crawler to output some stats that are generated as it goes
- DEBUG mode causes the crawler to output some stats that are generated as it goes, and other debug messages
- LOGGING is a dictConfig dictionary to log output to the console and a rotating file, and works out-of-the-box, but can be modified


Current State
=============
- mysql engine untested
- Lots of debug prints
- Issue in some situations where the database is locked and queries cannot execute. Presumably an issue only with sqlite's file-based approach

Logging
=======
- DEBUG+ level messages are logged to the console, and INFO+ level messages are logged to a file.
- By default, the file for logging uses a TimedRotatingFileHandler that rolls over at midnight
- Setting DEBUG in the settings toggles whether or not DEBUG level messages are output at all
- Setting USE_COLORS in the settings toggles whether or not messages output to the console use colors depending on the level.
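Since the LOGGING dict (shown in the settings.py diff further down) is plain Python, a hypothetical tweak like the one below — not part of this commit — would switch the file handler from midnight rollover to hourly rotation and keep roughly a week of backups, using standard TimedRotatingFileHandler options:

# settings.py (excerpt) -- illustrative override, not in this commit
LOGGING['handlers']['file'].update({
    'when': 'H',            # rotate every hour instead of at 'midnight'
    'interval': 1,
    'backupCount': 24 * 7,  # keep about one week of hourly log files
})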

Misc
====
- Designed to be able to run on multiple machines and work together to collect info in central DB
30 changes: 8 additions & 22 deletions content_processor.py
@@ -1,8 +1,9 @@
from ready_queue import ready_queue

from multiprocessing import Pool
import re, sys, logging

from ready_queue import ready_queue

import re, sys
logger = logging.getLogger("crawler_logger")

def rankKeywords(text):
    invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
@@ -12,11 +13,9 @@ def rankKeywords(text):
        if t in invalid_keywords:
            continue
        if not ranks.has_key(t):
            #print "adding %s" % t
            ranks[t] = 1
        else:
            ranks[t] += 1
            #print "setting %s to %i" % (t, ranks[t])
    return ranks

def stripPunctuation(text):
@@ -68,34 +67,28 @@ def combineKeywordLists(self):
            for k,v in l.items():
                if self.keywords.has_key(k):
                    self.keywords[k] += v
                    #print "setting %s to %i" %(k,self.keywords[k])
                else:
                    self.keywords[k] = v
                    #print "setting %s to %i" %(k,v)

    # returns links to queue
    def processBody(self):
        queue = ready_queue(self.url, self.body)
        #print "found %i links to queue" % len(queue)
        self.text = stripPunctuation(self.remove_html_tags(self.body))
        if len(self.text) > 5000:
            offset = 0
            i = 0
            l = []
            #print "splitting text"
            while True:
                j = self.findnth(self.text[i:],' ',500)
                offset += j
                #print "SPLIT: 500th space at %i" % j
                if j == -1:
                    #print "appending from %i on" % i
                    l.append(self.text[i:])
                    break
                #print "appending from %i to %i" % (i,j)
                l.append(self.text[i:j])
                i = offset + j+1
            #print "processing with %i threads" % len(l)
            logger.debug("processing with %i threads" % len(l))
            try:
                if len(l) == 0:
                    return []
                pool = Pool(processes=(len(l)))
                self.keyword_dicts = pool.map(rankKeywords, l)
            except KeyboardInterrupt:
@@ -105,7 +98,7 @@ def processBody(self):
            else:
                pool.close()
                pool.join()
            #print "processed, returned %i dicts" % len(self.keyword_dicts)
            logger.debug("processed, returned %i dicts" % len(self.keyword_dicts))
        else:
            self.keyword_dicts.append(rankKeywords(self.text))
        return queue
@@ -126,18 +119,11 @@ def findnth(self, haystack, needle, n):
    # returns the queue from processBody
    def process(self):
        text_lower = self.text.lower()
        #print "Finding title"
        self.title = self.text[text_lower.find('<title')+6:text_lower.find('</title>')]
        #print "Found title: %s" % self.title
        #print "Finding head"
        self.head = self.text[text_lower.find('<head')+5:text_lower.find('</head>')]
        #print "Found head of length %i" % len(self.head)
        self.processHead()
        #print "Finding body"
        self.body = self.text[text_lower.find('<body'):text_lower.find('</body>')]
        #print "Found body of length %i" % len(self.body)
        queue = self.processBody()
        #print "combining keyword lists"
        self.combineKeywordLists()
        return queue
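To make the keyword-ranking step concrete, here is a small self-contained sketch (not part of this commit) of what pool.map(rankKeywords, l) does per text chunk; rankKeywords is re-implemented here in simplified form and the sample strings are made up:

# toy example of the per-chunk keyword counting done in processBody
from multiprocessing import Pool

invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]

def rankKeywords(text):
    ranks = {}
    for t in text.split(' '):
        if t in invalid_keywords:
            continue
        ranks[t] = ranks.get(t, 0) + 1
    return ranks

if __name__ == "__main__":
    chunks = ["the quick crawler crawls the web", "crawler queues links for the crawler"]
    pool = Pool(processes=len(chunks))
    keyword_dicts = pool.map(rankKeywords, chunks)  # one frequency dict per chunk
    pool.close()
    pool.join()
    print keyword_dicts  # e.g. [{'quick': 1, 'crawler': 1, 'crawls': 1, 'web': 1}, {...}]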

5 changes: 4 additions & 1 deletion query.py
@@ -100,7 +100,10 @@ def addPage(self, data):
        if not self.connected:
            return False
        # Add the page to the crawl table
        result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
        try:
            result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
        except UnicodeDecodeError:
            return False
        if not result:
            return False
        # generate list of argument dictionaries for the insert many statement
48 changes: 46 additions & 2 deletions settings.py
@@ -1,4 +1,4 @@
# settings.py
import logging

DATABASE_ENGINE = "sqlite" # sqlite or mysql
DATABASE_NAME = "PyCrawler" # Database name
@@ -7,4 +7,48 @@
DATABASE_USER = "" # Not used with sqlite
DATABASE_PASS = "" # Not used with sqlite

VERBOSE = True
DEBUG = True # Whether or not to show DEBUG level messages
USE_COLORS = True # Whether or not colors should be used when outputting text

LOGGING = { # dictConfig for output stream and file logging
    'version': 1,
    'disable_existing_loggers': False,

    'formatters': {
        'console': {
            'format': '[%(asctime)s] %(levelname)s::%(module)s - %(message)s',
        },
        'file': {
            'format': '[%(asctime)s] %(levelname)s::(P:%(process)d T:%(thread)d)::%(module)s - %(message)s',
        },
    },

    'handlers': {
        'console': {
            'class': 'ColorStreamHandler.ColorStreamHandler',
            'formatter':'console',
            'level': 'DEBUG',
            'use_colors': USE_COLORS,
        },
        'file': {
            'class': 'logging.handlers.TimedRotatingFileHandler',
            'formatter':'file',
            'level': 'INFO',
            'when': 'midnight',
            'filename': 'pycrawler.log',
            'interval': 1,
            'backupCount': 0,
            'encoding': None,
            'delay': False,
            'utc': False,
        },
    },

    'loggers': {
        'crawler_logger': {
            'handlers': ['console', 'file'],
            'level': 'DEBUG' if DEBUG else 'INFO',
            'propagate': True,
        },
    }
}
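A quick way to sanity-check the level gating this dict sets up (the logger filters first, then each handler filters again) — hypothetical snippet, not part of the commit, and assuming it runs in a terminal so the curses-based handler can initialize:

import logging, logging.config
from settings import LOGGING

logging.config.dictConfig(LOGGING)
log = logging.getLogger("crawler_logger")

log.debug("shows on the console only; the file handler is INFO and above")
log.info("shows on the console and is written to pycrawler.log")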
