Skip to content

Commit

Permalink
Merge pull request #15 from ail1020/patch-1
Browse files Browse the repository at this point in the history
randomize order of root_urls
  • Loading branch information
1tayH authored Aug 19, 2018
2 parents 91c3927 + cfcc557 commit 2815487
Showing 1 changed file with 16 additions and 19 deletions.
35 changes: 16 additions & 19 deletions noisy.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,28 +230,25 @@ def crawl(self):
self._start_time = datetime.datetime.now()

while True:
for url in self._config["root_urls"]:
try:
body = self._request(url).content
self._links = self._extract_urls(body, url)
logging.debug("found {} links".format(len(self._links)))
self._browse_from_links()

except requests.exceptions.RequestException:
logging.warn("Error connecting to root url: {}".format(url))
url = random.choice(self._config["root_urls"])
try:
body = self._request(url).content
self._links = self._extract_urls(body, url)
logging.debug("found {} links".format(len(self._links)))
self._browse_from_links()

except requests.exceptions.RequestException:
logging.warn("Error connecting to root url: {}".format(url))

except MemoryError:
logging.warn("Error: content at url: {} is exhausting the memory".format(url))
except MemoryError:
logging.warn("Error: content at url: {} is exhausting the memory".format(url))

except LocationParseError:
logging.warn("Error encountered during parsing of: {}".format(url))

except self.CrawlerTimedOut:
logging.info("Timeout has exceeded, exiting")
return

logging.debug("No more links were found")
except LocationParseError:
logging.warn("Error encountered during parsing of: {}".format(url))

except self.CrawlerTimedOut:
logging.info("Timeout has exceeded, exiting")
return

def main():
parser = argparse.ArgumentParser()
Expand Down

0 comments on commit 2815487

Please sign in to comment.