Skip to content

Commit

Permalink
Keep running unless the timeout is hit
Browse files Browse the repository at this point in the history
  • Loading branch information
1tayH committed Jun 30, 2018
1 parent cedb0d6 commit e7b7967
Showing 1 changed file with 14 additions and 15 deletions.
29 changes: 14 additions & 15 deletions noisy.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,7 @@ def _browse_from_links(self, depth=0):
logging.debug("Exception on URL: %s, removing from list and trying again!" % random_link)
self._remove_and_blacklist(random_link)

depth += 1
self._browse_from_links(depth)
self._browse_from_links(depth + 1)

def load_config_file(self, file_path):
"""
Expand Down Expand Up @@ -217,22 +216,22 @@ def crawl(self):
"""
self._start_time = datetime.datetime.now()

for url in self._config["root_urls"]:
try:
body = self._request(url).content
self._links = self._extract_urls(body, url)
logging.debug("found {} links".format(len(self._links)))
self._browse_from_links()
while True:
for url in self._config["root_urls"]:
try:
body = self._request(url).content
self._links = self._extract_urls(body, url)
logging.debug("found {} links".format(len(self._links)))
self._browse_from_links()

except requests.exceptions.RequestException:
logging.warn("Error connecting to root url: {}".format(url))
continue
except requests.exceptions.RequestException:
logging.warn("Error connecting to root url: {}".format(url))

except self.CrawlerTimedOut:
logging.info("Timeout has exceeded, exiting")
return
except self.CrawlerTimedOut:
logging.info("Timeout has exceeded, exiting")
return

logging.debug("No more links were found")
logging.debug("No more links were found")


def main():
Expand Down

0 comments on commit e7b7967

Please sign in to comment.