#!/usr/bin/python3
# todo
# dead link finder flag
# detect when page is taking too long to respond
# import libraries
import urllib.request # url requesting library
import re # regular expression library
import argparse # argument parser library
import sys # sys library
import urllib.parse # url parsing library
from collections import deque # list alternative
# parse command line arguments
parser = argparse.ArgumentParser() # create the argument parser
parser.add_argument("url", help="must be the full url e.g. \"http://usd.edu\"", type=str) # add arguments
parser.add_argument("-i", "--iterative", help="search webpages that are linked on the given one", action="store_true")
parser.add_argument("-f", help="file to send results to", type=str)
parser.add_argument("-v", help="increases verbosity", action='count')
parser.add_argument("-e", "--expression", help="the regex to use, if not specified, use one that matches email addresses", type=str)
parser.add_argument("-u", help="include the url of the page that the regex was found on", action="store_true")
parser.add_argument("-c", "--count", help="the number of pages to look through", type=int)
parser.add_argument("-b", help="--comma sepparated list of netlocs that will not be looked through", type=str)
parser.add_argument("-w", help="--comma sepparated list of netlocs that will only be looked through", type=str)
args = parser.parse_args() # parse the arguments for use
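# example invocations (using the flags defined above; the URLs and patterns are illustrative):
#   ./webscrape.py "http://usd.edu"                          # scan a single page for email addresses
#   ./webscrape.py "http://usd.edu" -i -c 50 -f results.txt  # crawl up to 50 linked pages, append matches to results.txt
#   ./webscrape.py "http://usd.edu" -i -e "tel:[0-9-]+" -u   # crawl with a custom regex and record the source page url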
if args.v: # if there is at least one verbose flag
    def vPrint(*stuff): # define a function to print verbosity messages, the * allows the function to accept more than one item
        print(*stuff)
else:
    def vPrint(*stuff): # define a function that does not print verbosity messages
        pass # do nothing
def ePrint(*stuff): # define a function to print errors
    print(*stuff, file=sys.stderr) # print to stderr
if args.f: # if the file option is specified
    def sendToFile(*stuff): # define a function to send data to a file
        try:
            f = open(args.f, "a") # open the file for appending
            for thing in stuff: # loop through all the items to send
                f.write(thing) # write each item to the file
                f.write(' ') # followed by a space
            f.write('\n') # after writing all the items, add a newline
            f.close() # and close the file
        except Exception as err: # if something breaks, put the message in err
            ePrint("Failed to write to file because:", err) # print it to stderr, but do not raise the error
else: # if the file option was not specified
    def sendToFile(*stuff): # create a dummy function
        pass
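# when -f is given, each sendToFile call appends one line to the output file: the items separated by spaces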
def parseForRegex(text, compiledRegex): # define a function
    '''checks for regex in multiline text'''
    matches = []
    for line in text: # parse through all the lines in the text
        line = str(line) # make sure they are strings
        match = compiledRegex.search(line) # look for a match on the line using compiledRegex
        if match:
            matches.append(match.group(0)) # add the match to the matches list
    return matches # return the list of matches
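# note: re.search is called once per line, so parseForRegex records only the first match on each line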
def hrefParser(text): # define a function
    '''checks for href links in multiline text'''
    refs = []
    hrefExp = r"(\s)(?#matches any white space)+href=(?#matches the href= part)[\w\"-._~:/?()#@!$&\'*+,;=%\\\[\]]+(?#matches any alphanumeric and the other chars)([\s>])(?#matches any white space or the close of the tag)"
    # define the regex eg. " href=page.dfsaf(34532%232fdawefd4$#@ " would match, note the inclusion of spaces (any whitespace will work) before and after the href attrib.
    # See https://docs.python.org/3/howto/regex.html#the-backslash-plague for the reason why there is an r in front of the string
    hrefExpC = re.compile(hrefExp) # compile the regex for faster processing
    for line in text: # loop through all the lines in the text
        line = str(line) # make sure the line is a string
        match = hrefExpC.search(line) # find a match
        if match:
            link = match.group(0).strip() # drop the surrounding whitespace
            if link.startswith("href="): # remove the leading "href=" (lstrip("href=") would also eat leading url characters like the 'h' in "http")
                link = link[len("href="):]
            link = link.rstrip(">").strip("\"").strip("\'") # drop the tag close and any surrounding quotes
            refs.append(link) # append it to the list
    return refs # return the list
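# example: on the line '<a href="/about.html">' the regex matches ' href="/about.html">' and the cleanup above leaves '/about.html'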
def pageDownloader(url): # define a function to return a list of lines given a url
    try:
        pageObj = urllib.request.urlopen(url) # create a page object from the url
        page = pageObj.readlines() # read all of the lines into a list
    except Exception as err: # if something breaks, e.g. a 404 error, put the error in err
        ePrint("Failed to get", url, "because", err) # print message to stderr, but do not raise it
        return [] # return an empty list on error
    return page # return the list
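# note: readlines() returns bytes objects, so parseForRegex and hrefParser convert each line with str()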
# handle an optional expression
if args.expression: # if a different expression is specified on the command line
    expression = args.expression
    expression = re.compile(expression) # compile the expression to make it faster
else: # if no expression is specified
    expression = re.compile(r'[a-zA-Z0-9]+[a-zA-Z0-9_\.-]*@[a-zA-Z0-9][a-zA-Z0-9_\.-]+(\.[a-zA-Z]{2,})') # compile an expression that looks for email addresses
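# the default pattern matches addresses such as 'first.last@example.edu'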
if args.w and args.b: # if both a whitelist and a blacklist are specified
    ePrint('-b and -w can not be specified together') # print an error
    exit() # and exit
def skipLoc(netloc): # define a default function
    return False # always False, so no netloc is skipped
if args.w: # if a whitelist is specified
    whitelist = set(args.w.split(',')) # split it up into a set
    def skipLoc(netloc): # return True to skip, False to add to the todo list
        if netloc in whitelist:
            return False
        else:
            return True
if args.b: # if there is a blacklist
    blacklist = set(args.b.split(',')) # split it into a set
    def skipLoc(netloc): # return True to skip, False to add to the todo list
        if netloc in blacklist:
            return True
        else:
            return False
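# e.g. '-w usd.edu,www.usd.edu' restricts the crawl to those hosts, while '-b www.youtube.com' skips that host
# the iterative mode below is a simple breadth-first crawl: links are popped from one end of the deque,
# newly discovered links are pushed onto the other end, and the seen set prevents repeat visits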
if args.iterative:
    todo = deque()
    todo.append(args.url)
    matches = []
    seen = set()
    count = 0
    while True:
        try:
            link = todo.pop() # remove a link from todo and put it in link
        except IndexError: # if there was an IndexError (probably because there are no more items in the deque)
            print('looks like there are no more pages to process')
            break
        if args.count: # if there should be a limit
            if count == args.count: # if the limit is hit
                print("iterative matches:", matches) # print matches
                break # stop
        if link in seen: # if the link has been seen before
            continue # skip it
        page = pageDownloader(link) # use pageDownloader to create a list of lines in the page
        for href in hrefParser(page): # look through the page for hrefs
            (scheme, netloc, path, dummy, dummy, dummy) = urllib.parse.urlparse(href) # split the found uri
            if skipLoc(netloc): # if skipLoc says True then ignore the href
                continue
            (oldScheme, oldNetloc, oldPath, dummy, dummy, dummy) = urllib.parse.urlparse(link) # split the current uri
            if scheme == "": # if no scheme
                scheme = oldScheme # use the old one
            if netloc == "": # if no netloc
                netloc = oldNetloc # use the old one
            scheme = scheme + "://" # fix the scheme
            oldScheme = oldScheme + "://" # fix the old scheme
            if path == "": # if there is no path
                path = "/" # the path should be /
            # handle relative paths
            if path[0] == '/': # relative to the site root
                href = scheme + netloc + path # concatenate the uri together
            else: # not relative to the site root
                if path[0] == '.': # if the path starts with a '.' (e.g. "./" or "../")
                    ePrint("Warning,", link, "is confusing me because of paths with a '.' in them.") # tell the user this case is not handled and skip it
                    continue
                else:
                    href = scheme + netloc + oldPath + "/" + path
            if href not in seen:
                todo.appendleft(href)
                vPrint(href, "found,", len(todo), "pages found so far")
        vPrint(len(seen), "pages have been processed")
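        # note: urllib.parse.urljoin(link, href) would resolve relative references (including "./" and "../") in one step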
        for rMatch in parseForRegex(page, expression): # look through all the matches
            vPrint("found", rMatch, len(matches), "in list so far")
            matches.append(rMatch) # add them to the list
            if args.u: # if the user wants a url in the file with the match
                sendToFile(rMatch, link)
            else:
                sendToFile(rMatch)
        seen.add(link)
        count = count + 1
    print("iterative matches:", matches)
else:
    matches = parseForRegex(pageDownloader(args.url), expression) # find matches
    print("matches:", matches) # print them out
    for item in matches: # send each of them to file
        sendToFile(item)