Merge pull request theanti9#8 from WilHall/master
Updates to PyCrawler
theanti9 committed Nov 27, 2011
2 parents ca6d7a8 + d9ed584 commit fafb8c1
Showing 7 changed files with 160 additions and 45 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,3 +1,4 @@
*.pyc
.DS_Store
*.db
*.db
*.log
68 changes: 68 additions & 0 deletions ColorStreamHandler.py
@@ -0,0 +1,68 @@
import logging
import curses

class ColorStreamHandler(logging.Handler):

    def __init__(self, use_colors):
        logging.Handler.__init__(self)
        self.use_colors = use_colors

        # Initialize environment
        curses.setupterm()

        # Get the foreground color attribute for this environment
        self.fcap = curses.tigetstr('setaf')

        # Get the normal attribute
        self.COLOR_NORMAL = curses.tigetstr('sgr0')

        # Get + Save the color sequences
        self.COLOR_INFO = curses.tparm(self.fcap, curses.COLOR_GREEN)
        self.COLOR_ERROR = curses.tparm(self.fcap, curses.COLOR_RED)
        self.COLOR_WARNING = curses.tparm(self.fcap, curses.COLOR_YELLOW)
        self.COLOR_DEBUG = curses.tparm(self.fcap, curses.COLOR_BLUE)

    def color(self, msg, level):
        if level == "INFO":
            return "%s%s%s" % (self.COLOR_INFO, msg, self.COLOR_NORMAL)
        elif level == "WARNING":
            return "%s%s%s" % (self.COLOR_WARNING, msg, self.COLOR_NORMAL)
        elif level == "ERROR":
            return "%s%s%s" % (self.COLOR_ERROR, msg, self.COLOR_NORMAL)
        elif level == "DEBUG":
            return "%s%s%s" % (self.COLOR_DEBUG, msg, self.COLOR_NORMAL)
        else:
            return msg

    def emit(self, record):
        record.msg = record.msg.encode('utf-8', 'ignore')
        msg = self.format(record)

        # This just removes the date and milliseconds from asctime
        temp = msg.split(']')
        msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]

        if self.use_colors:
            msg = self.color(msg, record.levelname)
        print msg

    # 'record' has the following attributes:
    # threadName
    # name
    # thread
    # created
    # process
    # processName
    # args
    # module
    # filename
    # levelno
    # exc_text
    # pathname
    # lineno
    # msg
    # exc_info
    # funcName
    # relativeCreated
    # levelname
    # msecs
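For orientation, a minimal sketch (not part of this commit) of how a custom handler like this gets picked up through dictConfig; the module path, formatter, and crawler_logger name mirror the settings.py diff further down, and dictConfig hands the extra use_colors key to the handler's constructor. Assumes Python 2.7 and a terminal, since the handler calls curses.setupterm():

# sketch only -- assumes ColorStreamHandler.py is on the import path
import logging
import logging.config

logging.config.dictConfig({
    'version': 1,
    'formatters': {
        'console': {'format': '[%(asctime)s] %(levelname)s::%(module)s - %(message)s'},
    },
    'handlers': {
        'console': {
            'class': 'ColorStreamHandler.ColorStreamHandler',  # resolved by dotted path
            'formatter': 'console',
            'level': 'DEBUG',
            'use_colors': True,  # user-defined keys become constructor kwargs
        },
    },
    'loggers': {
        'crawler_logger': {'handlers': ['console'], 'level': 'DEBUG'},
    },
})

logging.getLogger("crawler_logger").info(u"crawler started")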
40 changes: 23 additions & 17 deletions PyCrawler.py
@@ -1,7 +1,9 @@
from query import CrawlerDb
from content_processor import ContentProcessor
from settings import VERBOSE
import sys, urlparse, urllib2, robotparser
from settings import LOGGING
import sys, urlparse, urllib2, shutil, glob, robotparser
import logging, logging.config
import traceback

# ===== Init stuff =====

@@ -12,28 +14,32 @@
# content processor init
processor = ContentProcessor(None, None, None)

# logging setup
logging.config.dictConfig(LOGGING)
logger = logging.getLogger("crawler_logger")

# robot parser init
robot = robotparser.RobotFileParser()

if len(sys.argv) < 2:
    print "Error: No start url was passed"
    logger.info("Error: No start url was passed")
    sys.exit()

l = sys.argv[1:]

cdb.enqueue(l)

def crawl():
    print "starting..."
    logger.info("Starting (%s)..." % sys.argv[1])
    while True:
        url = cdb.dequeue()
        u = urlparse.urlparse(url)
        robot.set_url('http://'+u[1]+"/robots.txt")
        if not robot.can_fetch('PyCrawler', url):
            print "Url disallowed by robots.txt: %s " % url
        if not robot.can_fetch('PyCrawler', url.encode('ascii', 'replace')):
            logger.warning("Url disallowed by robots.txt: %s " % url)
            continue
        if not url.startswith('http'):
            print "Unfollowable link found at %s " % url
            logger.warning("Unfollowable link found at %s " % url)
            continue

        if cdb.checkCrawled(url):
@@ -48,8 +54,7 @@ def crawl():
        try:
            request = urllib2.urlopen(req)
        except urllib2.URLError, e:
            print e
            print "Exception at url: %s" % url
            logger.error("Exception at url: %s\n%s" % (url, e))
            continue
        except urllib2.HTTPError, e:
            status = e.code
@@ -65,25 +70,26 @@ def crawl():
            if not cdb.checkCrawled(q):
                add_queue.append(q)

        processor.setInfo(str(url), status, data)
        add_queue = processor.process()
        l = len(add_queue)
        if VERBOSE:
            print "Got %s status from %s" % (status, url)
            print "Found %i links" % l
        logger.info("Got %s status from %s (Found %i links)" % (status, url, l))
        if l > 0:
            cdb.enqueue(add_queue)
        cdb.addPage(processor.getDataDict())
        processor.reset()

    print "finishing..."
    logger.info("Finishing...")
    cdb.close()
    print "done! goodbye!"
    logger.info("Done! Goodbye!")

if __name__ == "__main__":
    try:
        crawl()
    except KeyboardInterrupt:
        print "Stopping"
        logger.error("Stopping (KeyboardInterrupt)")
        sys.exit()
    except Exception, e:
        print "EXCEPTION: %s " % e

        logger.error("EXCEPTION: %s " % e)
        traceback.print_exc()

11 changes: 9 additions & 2 deletions README.md
@@ -4,15 +4,22 @@ Setup
- DATABASE_ENGINE can either be "mysql" or "sqlite"
- For sqlite only DATABASE_HOST is used, and it should begin with a '/'
- All other DATABASE_* settings are required for mysql
- VERBOSE mode causes the crawler to output some stats that are generated as it goes
- DEBUG mode causes the crawler to output some stats that are generated as it goes, and other debug messages
- LOGGING is a dictConfig dictionary to log output to the console and a rotating file, and works out-of-the-box, but can be modified


Current State
=============
- mysql engine untested
- Lots of debug prints
- Issue in some situations where the database is locked and queries cannot execute. Presumably an issue only with sqlite's file-based approach

Logging
=======
- DEBUG+ level messages are logged to the console, and INFO+ level messages are logged to a file.
- By default, the file for logging uses a TimedRotatingFileHandler that rolls over at midnight
- Setting DEBUG in the settings toggles whether or not DEBUG level messages are output at all
- Setting USE_COLORS in the settings toggles whether or not messages output to the console use colors depending on the level.
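Since the LOGGING dict (shown in the settings.py diff further down) is plain Python, a hypothetical tweak like the one below — not part of this commit — would switch the file handler from midnight rollover to hourly rotation and keep roughly a week of backups, using standard TimedRotatingFileHandler options:

# settings.py (excerpt) -- illustrative override, not in this commit
LOGGING['handlers']['file'].update({
    'when': 'H',            # rotate every hour instead of at 'midnight'
    'interval': 1,
    'backupCount': 24 * 7,  # keep about one week of hourly log files
})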

Misc
====
- Designed to be able to run on multiple machines and work together to collect info in central DB
30 changes: 8 additions & 22 deletions content_processor.py
@@ -1,8 +1,9 @@
from ready_queue import ready_queue

from multiprocessing import Pool
import re, sys, logging

from ready_queue import ready_queue

import re, sys
logger = logging.getLogger("crawler_logger")

def rankKeywords(text):
    invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
@@ -12,11 +13,9 @@ def rankKeywords(text):
        if t in invalid_keywords:
            continue
        if not ranks.has_key(t):
            #print "adding %s" % t
            ranks[t] = 1
        else:
            ranks[t] += 1
            #print "setting %s to %i" % (t, ranks[t])
    return ranks

def stripPunctuation(text):
@@ -68,34 +67,28 @@ def combineKeywordLists(self):
            for k,v in l.items():
                if self.keywords.has_key(k):
                    self.keywords[k] += v
                    #print "setting %s to %i" %(k,self.keywords[k])
                else:
                    self.keywords[k] = v
                    #print "setting %s to %i" %(k,v)

    # returns links to queue
    def processBody(self):
        queue = ready_queue(self.url, self.body)
        #print "found %i links to queue" % len(queue)
        self.text = stripPunctuation(self.remove_html_tags(self.body))
        if len(self.text) > 5000:
            offset = 0
            i = 0
            l = []
            #print "splitting text"
            while True:
                j = self.findnth(self.text[i:],' ',500)
                offset += j
                #print "SPLIT: 500th space at %i" % j
                if j == -1:
                    #print "appending from %i on" % i
                    l.append(self.text[i:])
                    break
                #print "appending from %i to %i" % (i,j)
                l.append(self.text[i:j])
                i = offset + j+1
            #print "processing with %i threads" % len(l)
            logger.debug("processing with %i threads" % len(l))
            try:
                if len(l) == 0:
                    return []
                pool = Pool(processes=(len(l)))
                self.keyword_dicts = pool.map(rankKeywords, l)
            except KeyboardInterrupt:
@@ -105,7 +98,7 @@ def processBody(self):
            else:
                pool.close()
                pool.join()
            #print "processed, returned %i dicts" % len(self.keyword_dicts)
            logger.debug("processed, returned %i dicts" % len(self.keyword_dicts))
        else:
            self.keyword_dicts.append(rankKeywords(self.text))
        return queue
@@ -126,18 +119,11 @@ def findnth(self, haystack, needle, n):
    # returns the queue from processBody
    def process(self):
        text_lower = self.text.lower()
        #print "Finding title"
        self.title = self.text[text_lower.find('<title')+6:text_lower.find('</title>')]
        #print "Found title: %s" % self.title
        #print "Finding head"
        self.head = self.text[text_lower.find('<head')+5:text_lower.find('</head>')]
        #print "Found head of length %i" % len(self.head)
        self.processHead()
        #print "Finding body"
        self.body = self.text[text_lower.find('<body'):text_lower.find('</body>')]
        #print "Found body of length %i" % len(self.body)
        queue = self.processBody()
        #print "combining keyword lists"
        self.combineKeywordLists()
        return queue
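To make the keyword-ranking step concrete, here is a small self-contained sketch (not part of this commit) of what pool.map(rankKeywords, l) does per text chunk; rankKeywords is re-implemented here in simplified form and the sample strings are made up:

# toy example of the per-chunk keyword counting done in processBody
from multiprocessing import Pool

invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]

def rankKeywords(text):
    ranks = {}
    for t in text.split(' '):
        if t in invalid_keywords:
            continue
        ranks[t] = ranks.get(t, 0) + 1
    return ranks

if __name__ == "__main__":
    chunks = ["the quick crawler crawls the web", "crawler queues links for the crawler"]
    pool = Pool(processes=len(chunks))
    keyword_dicts = pool.map(rankKeywords, chunks)  # one frequency dict per chunk
    pool.close()
    pool.join()
    print keyword_dicts  # e.g. [{'quick': 1, 'crawler': 1, 'crawls': 1, 'web': 1}, {...}]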

5 changes: 4 additions & 1 deletion query.py
@@ -100,7 +100,10 @@ def addPage(self, data):
        if not self.connected:
            return False
        # Add the page to the crawl table
        result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
        try:
            result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
        except UnicodeDecodeError:
            return False
        if not result:
            return False
        # generate list of argument dictionaries for the insert many statement
48 changes: 46 additions & 2 deletions settings.py
@@ -1,4 +1,4 @@
# settings.py
import logging

DATABASE_ENGINE = "sqlite" # sqlite or mysql
DATABASE_NAME = "PyCrawler" # Database name
@@ -7,4 +7,48 @@
DATABASE_USER = "" # Not used with sqlite
DATABASE_PASS = "" # Not used with sqlite

VERBOSE = True
DEBUG = True # Whether or not to show DEBUG level messages
USE_COLORS = True # Whether or not colors should be used when outputting text

LOGGING = { # dictConfig for output stream and file logging
    'version': 1,
    'disable_existing_loggers': False,

    'formatters': {
        'console': {
            'format': '[%(asctime)s] %(levelname)s::%(module)s - %(message)s',
        },
        'file': {
            'format': '[%(asctime)s] %(levelname)s::(P:%(process)d T:%(thread)d)::%(module)s - %(message)s',
        },
    },

    'handlers': {
        'console': {
            'class': 'ColorStreamHandler.ColorStreamHandler',
            'formatter':'console',
            'level': 'DEBUG',
            'use_colors': USE_COLORS,
        },
        'file': {
            'class': 'logging.handlers.TimedRotatingFileHandler',
            'formatter':'file',
            'level': 'INFO',
            'when': 'midnight',
            'filename': 'pycrawler.log',
            'interval': 1,
            'backupCount': 0,
            'encoding': None,
            'delay': False,
            'utc': False,
        },
    },

    'loggers': {
        'crawler_logger': {
            'handlers': ['console', 'file'],
            'level': 'DEBUG' if DEBUG else 'INFO',
            'propagate': True,
        },
    }
}
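A quick way to sanity-check the level gating this dict sets up (the logger filters first, then each handler filters again) — hypothetical snippet, not part of the commit, and assuming it runs in a terminal so the curses-based handler can initialize:

import logging, logging.config
from settings import LOGGING

logging.config.dictConfig(LOGGING)
log = logging.getLogger("crawler_logger")

log.debug("shows on the console only; the file handler is INFO and above")
log.info("shows on the console and is written to pycrawler.log")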
