# noisy.py (forked from 1tayH/noisy)
import requests
import re
import time
import random
import logging
import argparse
import json
import urlparse
import sys

# Python 2 only: restore setdefaultencoding (hidden by site.py) so implicit
# str/unicode conversions of fetched pages use latin-1 instead of raising
# UnicodeDecodeError
reload(sys)
sys.setdefaultencoding('latin-1')


class Crawler(object):
    def __init__(self):
        """
        Initializes the Crawler class
        """
        self._config = {}
        self._links = []

    def _request(self, url):
        """
        Sends a GET request to a URL using a random user agent
        :param url: the url to visit
        :return: the response Requests object
        """
        random_user_agent = random.choice(self._config["user_agents"])
        headers = {'user-agent': random_user_agent}

        response = requests.get(url, headers=headers, timeout=5)
        return response

    @staticmethod
    def _normalize_link(link, root_url):
        """
        Normalizes links extracted from the DOM by making them all absolute, so
        we can request them. For example, turns a "/images" link extracted from
        https://imgur.com into "https://imgur.com/images"
        :param link: link found in the DOM
        :param root_url: the URL the DOM was loaded from
        :return: absolute link
        """
        parsed_url = urlparse.urlparse(link)
        parsed_root_url = urlparse.urlparse(root_url)

        # '//' means keep the current protocol used to access this URL
        if link.startswith("//"):
            return "{}://{}{}".format(parsed_root_url.scheme, parsed_url.netloc, parsed_url.path)

        # possibly a relative path
        if not parsed_url.scheme:
            return urlparse.urljoin(root_url, link)

        return link
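
    # Illustrative sketch (hypothetical URLs) of what _normalize_link returns:
    #   _normalize_link("//cdn.example.com/app.js", "https://example.com/page")
    #       -> "https://cdn.example.com/app.js"    (protocol-relative: reuse the root's scheme)
    #   _normalize_link("/images", "https://imgur.com")
    #       -> "https://imgur.com/images"          (relative path joined onto the root URL)
    #   _normalize_link("https://other.org/a", "https://example.com")
    #       -> "https://other.org/a"               (already absolute, returned unchanged)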

    @staticmethod
    def _is_valid_url(url):
        """
        Check if a url is a valid url.
        Used to filter out invalid values that were found in the "href" attribute,
        for example "javascript:void(0)"
        taken from https://stackoverflow.com/questions/7160737
        :param url: url to be checked
        :return: boolean indicating whether the URL is valid or not
        """
        regex = re.compile(
            r'^(?:http|ftp)s?://'  # http:// or https://
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
            r'(?::\d+)?'  # optional port
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return re.match(regex, url) is not None
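
    # Illustrative sketch (hypothetical inputs) of what the regex accepts and rejects:
    #   _is_valid_url("https://example.com/page?id=1")  -> True   (scheme + domain + path)
    #   _is_valid_url("http://192.168.1.1:8080/")       -> True   (IP address, optional port)
    #   _is_valid_url("javascript:void(0)")             -> False  (not http/https/ftp)
    #   _is_valid_url("mailto:someone@example.com")     -> False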

    def _is_blacklisted(self, url):
        """
        Checks if a URL is blacklisted
        :param url: full URL
        :return: boolean indicating whether a URL is blacklisted or not
        """
        return any(blacklisted_url in url for blacklisted_url in self._config["blacklisted_urls"])

    def _should_accept_url(self, url):
        """
        Filters out a URL if it is blacklisted or not valid; the filtering logic lives here
        :param url: full url to be checked
        :return: boolean of whether or not the url should be accepted and potentially visited
        """
        return self._is_valid_url(url) and not self._is_blacklisted(url)

    def _extract_urls(self, body, root_url):
        """
        Gathers links to be visited in the future from a web page's body
        by finding "href" attributes in the DOM
        :param body: the HTML body to extract links from
        :param root_url: the root URL of the given body
        :return: list of extracted links
        """
        pattern = r"href=[\"'](?!#)(.*?)[\"'].*?"  # ignore links starting with #, no point in re-visiting the same page
        urls = re.findall(pattern, str(body))
        normalized_urls = [self._normalize_link(url, root_url) for url in urls]
        filtered_urls = filter(self._should_accept_url, normalized_urls)
        return filtered_urls
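
    # Illustrative sketch (made-up markup) of the extraction step, assuming nothing is blacklisted:
    #   body = '<a href="/about">About</a> <a href="#top">Top</a> <a href="https://other.org">x</a>'
    #   _extract_urls(body, "https://example.com")
    #       -> ["https://example.com/about", "https://other.org"]
    #   "#top" is skipped by the (?!#) lookahead; the remaining matches are normalized,
    #   then filtered through _should_accept_url.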

    def _remove_and_blacklist(self, link):
        """
        Removes a link from our current links list
        and blacklists it so we don't visit it in the future
        :param link: link to remove and blacklist
        """
        self._config['blacklisted_urls'].append(link)
        del self._links[self._links.index(link)]

    def _browse_from_links(self, depth=0):
        """
        Selects a random link out of the available link list and visits it.
        Blacklists any link that is not responsive or that contains no other links.
        Please note that this function is recursive and will keep calling itself until
        a dead end has been reached or we have run out of links
        :param depth: our current link depth
        """
        is_depth_reached = depth >= self._config['max_depth']
        if not len(self._links) or is_depth_reached:
            # escape from the recursion: we have no links to continue with or we reached the max depth
            logging.debug("Hit a dead end, moving to the next root URL")
            return

        random_link = random.choice(self._links)
        try:
            logging.info("Visiting {}".format(random_link))
            sub_page = self._request(random_link).content
            sub_links = self._extract_urls(sub_page, random_link)

            # sleep for a random amount of time
            time.sleep(random.randrange(self._config["min_sleep"], self._config["max_sleep"]))

            # make sure we have more than 1 link to pick from
            if len(sub_links) > 1:
                # continue browsing from the links extracted from the new page
                self._links = self._extract_urls(sub_page, random_link)
            else:
                # dead end: remove the link from our list, blacklist it and retry
                # with the current link list
                self._remove_and_blacklist(random_link)
        except requests.exceptions.RequestException:
            logging.debug("Exception on URL: %s, removing from list and trying again!" % random_link)
            self._remove_and_blacklist(random_link)

        depth += 1
        self._browse_from_links(depth)
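
    # Illustrative trace, assuming a config with max_depth=3: starting from a page's links,
    # each call picks one random link, visits it, replaces self._links with the links found
    # there, and recurses with depth+1; the call made with depth=3 returns immediately
    # because depth >= max_depth, unwinding back to crawl(), which moves on to the next root URL.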

    def load_config_file(self, file_path):
        """
        Loads and decodes a JSON config file, sets the config of the crawler instance
        to the loaded one
        :param file_path: path of the config file
        :return:
        """
        with open(file_path, 'r') as config_file:
            config = json.load(config_file)
            self.set_config(config)

    def set_config(self, config):
        """
        Sets the config of the crawler instance to the provided dict
        :param config: dict of configuration options, for example:
        {
            "root_urls": [],
            "blacklisted_urls": [],
            "click_depth": 5
            ...
        }
        """
        self._config = config
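
    # Illustrative config sketch covering only the keys this file actually reads
    # (the values are made up; the project's real config.json may contain more):
    #   {
    #       "max_depth": 25,
    #       "min_sleep": 3,
    #       "max_sleep": 6,
    #       "root_urls": ["https://example.com"],
    #       "blacklisted_urls": ["some-ad-network.example"],
    #       "user_agents": ["Mozilla/5.0 (compatible; ExampleBot/1.0)"]
    #   }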

    def crawl(self):
        """
        Collects links from our root urls, stores them and then calls
        `_browse_from_links` to browse them
        """
        for url in self._config["root_urls"]:
            try:
                body = self._request(url).content
                self._links = self._extract_urls(body, url)
            except requests.exceptions.RequestException:
                logging.warning("Error connecting to root url: {}".format(url))
                continue

            logging.debug("found {} links".format(len(self._links)))
            self._browse_from_links()

        logging.debug("No more links were found")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--log', metavar='-l', type=str, help='logging level', default='info')
    parser.add_argument('--config', metavar='-c', required=True, type=str, help='config file')
    args = parser.parse_args()

    level = getattr(logging, args.log.upper())
    logging.basicConfig(level=level)

    crawler = Crawler()
    crawler.load_config_file(args.config)

    while True:
        crawler.crawl()


if __name__ == '__main__':
    main()
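
# Illustrative usage sketch (the file name config.json is an assumption, not part of this script):
#
#   $ python2 noisy.py --config config.json --log debug
#
# The config file is JSON with the keys read above (root_urls, user_agents, blacklisted_urls,
# min_sleep, max_sleep, max_depth). The script then crawls forever, one root URL after another,
# until interrupted with Ctrl+C.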