Add timeout option
1tayH committed Jun 29, 2018
Parent: 3c71ff1 · Commit: 914bea5
Showing 3 changed files with 52 additions and 8 deletions.
README.md: 9 changes (5 additions, 4 deletions)
@@ -38,12 +38,13 @@ python noisy.py --config config.json
 The program can accept a number of command line arguments:
 ```
 $ python noisy.py --help
-usage: noisy.py [-h] [--log -l] [--config -c]
+usage: noisy.py [-h] [--log -l] --config -c [--timeout -t]
 optional arguments:
-  -h, --help   show this help message and exit
-  --log -l     logging level
-  --config -c  config file
+  -h, --help    show this help message and exit
+  --log -l      logging level
+  --config -c   config file
+  --timeout -t  for how long the crawler should be running, in seconds
 ```
 only the config file argument is required.

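For reference, a run that stops itself after five minutes would look something like the sketch below; the 300-second value is an illustrative choice, not part of the commit. When the flag is omitted, `--timeout` defaults to `False`, which the crawler treats as "no time limit".
```
$ python noisy.py --config config.json --timeout 300
```
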
config.json: 1 change (1 addition, 0 deletions)
@@ -2,6 +2,7 @@
     "max_depth": 25,
     "min_sleep": 3,
     "max_sleep": 6,
+    "timeout": false,
     "root_urls": [
         "http://4chan.org",
         "https://www.reddit.com",
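Because `_is_timeout_reached` reads the `timeout` key from the crawler's config dict (see the noisy.py diff below), the limit can presumably also be set here rather than on the command line, by replacing the `"timeout": false` line above with a number of seconds; the 300 below is only an example, not part of the commit:
```
"timeout": 300,
```
Leaving the value as `false` disables the time limit entirely.
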
noisy.py: 50 changes (46 additions, 4 deletions)
Expand Up @@ -7,6 +7,8 @@
import json
import urlparse
import sys
import datetime


reload(sys)
sys.setdefaultencoding('latin-1')
@@ -19,6 +19,13 @@ def __init__(self):
         """
         self._config = {}
         self._links = []
+        self._start_time = None
+
+    class CrawlerTimedOut(Exception):
+        """
+        Raised when the specified timeout is exceeded
+        """
+        pass

     def _request(self, url):
         """
@@ -129,6 +138,9 @@ def _browse_from_links(self, depth=0):
             # escape from the recursion, we don't have links to continue or we have reached the max depth
             return

+        if self._is_timeout_reached():
+            raise self.CrawlerTimedOut
+
         random_link = random.choice(self._links)
         try:
             logging.info("Visiting {}".format(random_link))
@@ -178,29 +190,57 @@ def set_config(self, config):
         """
         self._config = config

+    def set_option(self, option, value):
+        """
+        Sets a specific key in the config dict
+        :param option: the option key in the config, for example: "max_depth"
+        :param value: value for the option
+        """
+        self._config[option] = value
+
+    def _is_timeout_reached(self):
+        """
+        Determines whether the specified timeout has reached, if no timeout
+        is specified then return false
+        :return: boolean indicating whether the timeout has reached
+        """
+        is_timeout_set = self._config["timeout"] is not False  # False is set when no timeout is desired
+        end_time = self._start_time + datetime.timedelta(seconds=self._config["timeout"])
+        is_timed_out = datetime.datetime.now() >= end_time
+
+        return is_timeout_set and is_timed_out
+
     def crawl(self):
         """
         Collects links from our root urls, stores them and then calls
         `_browse_from_links` to browse them
         """
+        self._start_time = datetime.datetime.now()
+
         for url in self._config["root_urls"]:
             try:
                 body = self._request(url).content
                 self._links = self._extract_urls(body, url)
+                logging.debug("found {} links".format(len(self._links)))
+                self._browse_from_links()

             except requests.exceptions.RequestException:
                 logging.warn("Error connecting to root url: {}".format(url))
                 continue

-            logging.debug("found {} links".format(len(self._links)))
+            except self.CrawlerTimedOut:
+                logging.info("Timeout has exceeded, exiting")
+                return

-        self._browse_from_links()
+        logging.debug("No more links were found")


 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--log', metavar='-l', type=str, help='logging level', default='info')
     parser.add_argument('--config', metavar='-c', required=True, type=str, help='config file')
+    parser.add_argument('--timeout', metavar='-t', required=False, type=int,
+                        help='for how long the crawler should be running, in seconds', default=False)
     args = parser.parse_args()

     level = getattr(logging, args.log.upper())
@@ -209,8 +249,10 @@ def main():
     crawler = Crawler()
     crawler.load_config_file(args.config)

-    while True:
-        crawler.crawl()
+    if args.timeout:
+        crawler.set_option('timeout', args.timeout)
+
+    crawler.crawl()


 if __name__ == '__main__':
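The new timeout mechanism reduces to the elapsed-time comparison in `_is_timeout_reached`: `crawl()` records `_start_time` once, and `_browse_from_links` raises `CrawlerTimedOut` when the check fires, which `crawl()` catches to exit cleanly. The standalone sketch below (not part of the commit; the function name, dict literals, and second values are illustrative) reproduces that check to show why `"timeout": false` means the crawler never times out:
```
import datetime


def is_timeout_reached(config, start_time):
    # Mirrors Crawler._is_timeout_reached from the diff above.
    is_timeout_set = config["timeout"] is not False  # "timeout": false means no limit is set
    # timedelta(seconds=False) is simply timedelta(0), so this line is safe even when disabled
    end_time = start_time + datetime.timedelta(seconds=config["timeout"])
    is_timed_out = datetime.datetime.now() >= end_time
    return is_timeout_set and is_timed_out


# Pretend the crawler started 400 seconds ago.
start = datetime.datetime.now() - datetime.timedelta(seconds=400)
print(is_timeout_reached({"timeout": False}, start))  # False: timeout disabled
print(is_timeout_reached({"timeout": 300}, start))    # True: 400s elapsed exceeds the 300s limit
```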
