Skip to content

Commit

Permalink
Keep running unless the timeout is hit
Browse files Browse the repository at this point in the history
  • Loading branch information
1tayH committed Jun 30, 2018
1 parent cedb0d6 commit e7b7967
Showing 1 changed file with 14 additions and 15 deletions.
29 changes: 14 additions & 15 deletions noisy.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,7 @@ def _browse_from_links(self, depth=0):
logging.debug("Exception on URL: %s, removing from list and trying again!" % random_link)
self._remove_and_blacklist(random_link)

depth += 1
self._browse_from_links(depth)
self._browse_from_links(depth + 1)

def load_config_file(self, file_path):
"""
Expand Down Expand Up @@ -217,22 +216,22 @@ def crawl(self):
"""
self._start_time = datetime.datetime.now()

for url in self._config["root_urls"]:
try:
body = self._request(url).content
self._links = self._extract_urls(body, url)
logging.debug("found {} links".format(len(self._links)))
self._browse_from_links()
while True:
for url in self._config["root_urls"]:
try:
body = self._request(url).content
self._links = self._extract_urls(body, url)
logging.debug("found {} links".format(len(self._links)))
self._browse_from_links()

except requests.exceptions.RequestException:
logging.warn("Error connecting to root url: {}".format(url))
continue
except requests.exceptions.RequestException:
logging.warn("Error connecting to root url: {}".format(url))

except self.CrawlerTimedOut:
logging.info("Timeout has exceeded, exiting")
return
except self.CrawlerTimedOut:
logging.info("Timeout has exceeded, exiting")
return

logging.debug("No more links were found")
logging.debug("No more links were found")


def main():
Expand Down

0 comments on commit e7b7967

Please sign in to comment.