Add timeout option
1tayH committed Jun 29, 2018
Parent: 3c71ff1 · Commit: 914bea5
Showing 3 changed files with 52 additions and 8 deletions.
README.md: 9 changes (5 additions, 4 deletions)
@@ -38,12 +38,13 @@ python noisy.py --config config.json
 The program can accept a number of command line arguments:
 ```
 $ python noisy.py --help
-usage: noisy.py [-h] [--log -l] [--config -c]
+usage: noisy.py [-h] [--log -l] --config -c [--timeout -t]
 optional arguments:
-  -h, --help   show this help message and exit
-  --log -l     logging level
-  --config -c  config file
+  -h, --help    show this help message and exit
+  --log -l      logging level
+  --config -c   config file
+  --timeout -t  for how long the crawler should be running, in seconds
 ```
 only the config file argument is required.

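For reference, a run that stops itself after five minutes would look something like the sketch below; the 300-second value is an illustrative choice, not part of the commit. When the flag is omitted, `--timeout` defaults to `False`, which the crawler treats as "no time limit".
```
$ python noisy.py --config config.json --timeout 300
```
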
config.json: 1 change (1 addition, 0 deletions)
@@ -2,6 +2,7 @@
     "max_depth": 25,
     "min_sleep": 3,
     "max_sleep": 6,
+    "timeout": false,
     "root_urls": [
         "http://4chan.org",
         "https://www.reddit.com",
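Because `_is_timeout_reached` reads the `timeout` key from the crawler's config dict (see the noisy.py diff below), the limit can presumably also be set here rather than on the command line, by replacing the `"timeout": false` line above with a number of seconds; the 300 below is only an example, not part of the commit:
```
"timeout": 300,
```
Leaving the value as `false` disables the time limit entirely.
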
noisy.py: 50 changes (46 additions, 4 deletions)
Expand Up @@ -7,6 +7,8 @@
import json
import urlparse
import sys
import datetime


reload(sys)
sys.setdefaultencoding('latin-1')
@@ -19,6 +19,13 @@ def __init__(self):
         """
         self._config = {}
         self._links = []
+        self._start_time = None
+
+    class CrawlerTimedOut(Exception):
+        """
+        Raised when the specified timeout is exceeded
+        """
+        pass

     def _request(self, url):
         """
@@ -129,6 +138,9 @@ def _browse_from_links(self, depth=0):
             # escape from the recursion, we don't have links to continue or we have reached the max depth
             return

+        if self._is_timeout_reached():
+            raise self.CrawlerTimedOut
+
         random_link = random.choice(self._links)
         try:
             logging.info("Visiting {}".format(random_link))
@@ -178,29 +190,57 @@ def set_config(self, config):
         """
         self._config = config

+    def set_option(self, option, value):
+        """
+        Sets a specific key in the config dict
+        :param option: the option key in the config, for example: "max_depth"
+        :param value: value for the option
+        """
+        self._config[option] = value
+
+    def _is_timeout_reached(self):
+        """
+        Determines whether the specified timeout has reached, if no timeout
+        is specified then return false
+        :return: boolean indicating whether the timeout has reached
+        """
+        is_timeout_set = self._config["timeout"] is not False  # False is set when no timeout is desired
+        end_time = self._start_time + datetime.timedelta(seconds=self._config["timeout"])
+        is_timed_out = datetime.datetime.now() >= end_time
+
+        return is_timeout_set and is_timed_out
+
     def crawl(self):
         """
         Collects links from our root urls, stores them and then calls
         `_browse_from_links` to browse them
         """
+        self._start_time = datetime.datetime.now()
+
         for url in self._config["root_urls"]:
             try:
                 body = self._request(url).content
                 self._links = self._extract_urls(body, url)
+                logging.debug("found {} links".format(len(self._links)))
+                self._browse_from_links()

             except requests.exceptions.RequestException:
                 logging.warn("Error connecting to root url: {}".format(url))
                 continue

-            logging.debug("found {} links".format(len(self._links)))
+            except self.CrawlerTimedOut:
+                logging.info("Timeout has exceeded, exiting")
+                return

-        self._browse_from_links()
+        logging.debug("No more links were found")


 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--log', metavar='-l', type=str, help='logging level', default='info')
     parser.add_argument('--config', metavar='-c', required=True, type=str, help='config file')
+    parser.add_argument('--timeout', metavar='-t', required=False, type=int,
+                        help='for how long the crawler should be running, in seconds', default=False)
     args = parser.parse_args()

     level = getattr(logging, args.log.upper())
@@ -209,8 +249,10 @@ def main():
     crawler = Crawler()
     crawler.load_config_file(args.config)

-    while True:
-        crawler.crawl()
+    if args.timeout:
+        crawler.set_option('timeout', args.timeout)
+
+    crawler.crawl()


 if __name__ == '__main__':
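The new timeout mechanism reduces to the elapsed-time comparison in `_is_timeout_reached`: `crawl()` records `_start_time` once, and `_browse_from_links` raises `CrawlerTimedOut` when the check fires, which `crawl()` catches to exit cleanly. The standalone sketch below (not part of the commit; the function name, dict literals, and second values are illustrative) reproduces that check to show why `"timeout": false` means the crawler never times out:
```
import datetime


def is_timeout_reached(config, start_time):
    # Mirrors Crawler._is_timeout_reached from the diff above.
    is_timeout_set = config["timeout"] is not False  # "timeout": false means no limit is set
    # timedelta(seconds=False) is simply timedelta(0), so this line is safe even when disabled
    end_time = start_time + datetime.timedelta(seconds=config["timeout"])
    is_timed_out = datetime.datetime.now() >= end_time
    return is_timeout_set and is_timed_out


# Pretend the crawler started 400 seconds ago.
start = datetime.datetime.now() - datetime.timedelta(seconds=400)
print(is_timeout_reached({"timeout": False}, start))  # False: timeout disabled
print(is_timeout_reached({"timeout": 300}, start))    # True: 400s elapsed exceeds the 300s limit
```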
