Skip to content

Commit

Permalink
Automatically checking for sitemap existence in case of --crawl
Browse files Browse the repository at this point in the history
  • Loading branch information
stamparm committed Jan 20, 2015
1 parent a603002 commit 9f4a32c
Showing 1 changed file with 21 additions and 0 deletions.
21 changes: 21 additions & 0 deletions lib/utils/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from lib.core.settings import UNICODE_ENCODING
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.oset.pyoset import oset
Expand Down Expand Up @@ -116,6 +117,26 @@ def crawlThread():
threadData.shared.deeper = set()
threadData.shared.unprocessed = set([target])

# Best-effort sitemap probe: when --crawl is used without an explicit
# --sitemap-url, offer to fetch <scheme>://<host>/sitemap.xml and seed the
# crawler's work queues with any URLs it lists.
if not conf.sitemapUrl:
    message = "do you want to check for the existence of "
    message += "site's sitemap(.xml) [Y/n] "
    test = readInput(message, default="Y")
    if test[0] not in ("n", "N"):
        items = None
        url = "%s://%s/sitemap.xml" % (conf.scheme, conf.hostname)
        try:
            items = parseSitemap(url)
        except Exception:
            # Narrowed from a bare 'except' so KeyboardInterrupt/SystemExit
            # still propagate (user can Ctrl-C a hung fetch). A missing or
            # malformed sitemap is expected and deliberately ignored.
            pass
        # No 'finally' needed: the except above swallows parse failures,
        # so control always reaches this post-processing.
        if items:
            for item in items:
                # Only URLs carrying a query string are immediately
                # testable targets for the scanner.
                if re.search(r"(.*?)\?(.+)", item):
                    threadData.shared.value.add(item)
            if conf.crawlDepth > 1:
                # Feed every sitemap URL (parameterless ones too) back
                # into the crawler for deeper link discovery.
                threadData.shared.unprocessed.update(items)
        logger.info("%s links found" % ("no" if not items else len(items)))

infoMsg = "starting crawler"
if conf.bulkFile:
infoMsg += " for target URL '%s'" % target
Expand Down

0 comments on commit 9f4a32c

Please sign in to comment.