# js_extractor.py
import re
import requests
import io
import os
import argparse
import sys
import subprocess
from datetime import datetime
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# --- Command line ----------------------------------------------------------
# Arguments are parsed before stdin is consumed so that -h/--help and usage
# errors are reported immediately instead of only after stdin reaches EOF.
parser = argparse.ArgumentParser()
# The domain is what separates first-party JS files from 3rd-party ones.
parser.add_argument("-d", "--domain", help="domain to test", action="append")
_domains = parser.parse_args().domain
if not _domains or len(_domains) != 1:
    # A missing -d used to become the literal string "None" and repeated -d
    # values became "a', 'b"; neither can ever match a URL, so fail early.
    parser.error("exactly one -d/--domain value is required")
domain_name = _domains[0]

# --- Input -----------------------------------------------------------------
# JS file URLs, one per line, read from stdin.
js_file = sys.stdin.readlines()
# Alternative input: read the URL list from a file instead of stdin, e.g.
#   js_file = open("/path/test12.txt", "r").readlines()
# Reading several input files is not implemented.

# --- Result accumulators (filled in place by the functions below) ----------
original_lines = []          # de-duplicated input URLs
all_endpoints_1st_lvl = []   # paths found inside the input JS files
js_files_2nd_lvl = []        # .js paths found inside the input JS files
all_endpoints_2nd_lvl = []   # paths found inside the 2nd-level JS files
js_files_3rd_lvl = []        # .js paths found inside the 2nd-level JS files

# --- Download directory (only used by the disabled retire.js check) --------
# Timestamp without spaces, colons or fractional seconds so that it is usable
# as a directory name.
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
curpath = os.path.abspath(os.curdir)
directory_with_js_files = "%s/js_files/%s/" % (curpath, now)  ## directory of downloaded js files for other tools
def deduplication(input_file, original_lines):
    """Append the normalised, not-yet-seen entries of input_file to original_lines.

    Every entry is normalised by removing a ``?v=...`` suffix and a ``#...``
    fragment (both up to the end of the line) and by stripping surrounding
    whitespace. Entries that are empty after normalisation are skipped.

    Args:
        input_file: Iterable of strings, e.g. the lines of a file or a list
            of URLs. Trailing newlines are allowed.
        original_lines: List that is extended in place. Entries it already
            holds count as seen; order of first appearance is preserved.

    Returns:
        None. The result is delivered through ``original_lines``.
    """
    # A set makes the membership test O(1) instead of scanning the list.
    seen = set(original_lines)
    for line in input_file:
        line = re.sub(r"\?v=.*?$", "", line)  # cache-busting version suffix
        line = re.sub(r"#.*?$", "", line)     # fragment
        line = line.strip()
        # Blank lines (or lines that were nothing but a fragment) carry no URL.
        if line and line not in seen:
            seen.add(line)
            original_lines.append(line)
# Fill original_lines with the normalised, de-duplicated lines read from stdin.
deduplication(js_file, original_lines)
def _fetch_js(url, timeout):
    """Return the body of url as text, or None when it cannot be used.

    A HEAD request is sent first so that files which do not answer 200 are
    never downloaded. A 404 answer and any network error are reported on
    stdout; every other non-200 status is skipped silently.
    """
    try:
        status = requests.head(url, verify=False, timeout=timeout).status_code
        if status == 200:
            return requests.get(url, verify=False, timeout=timeout).text
    except requests.exceptions.RequestException as err:
        # One unreachable or malformed URL must not abort the whole run.
        print("JS file {} could not be fetched: {}".format(url, err))
        return None
    if status == 404:
        print("JS file {} returned 404 code. Check the host and try to apply file upload with path traversal/PUT method file upload.".format(url))
    return None


def _quoted_paths(js_text, clear_url):
    """Return the paths quoted in js_text, each prefixed with clear_url.

    The result is a list of lines; all but the last end with a newline.
    """
    # Double-quoted strings that start with "/" and consist of URL characters.
    matches = re.findall(r'"\/[a-zA-Z0-9_?&=/\-\#\.]*"', js_text)
    paths = "\n".join(m[1:-1] for m in matches)  # drop the surrounding quotes
    # Prefix every line. With no match this still yields one line holding just
    # clear_url, which is kept for backward compatibility.
    u = "\n".join(clear_url + path for path in paths.split("\n"))
    # Strip static-asset extensions. The dots are escaped and a word boundary
    # is required so that "/css/" or "/icons/" are no longer mangled.
    # NOTE(review): only the extension is removed, the entry itself stays -
    # confirm whether such entries should be dropped entirely.
    u = re.sub(r"\.(?:css|png|jpg|svg|jpeg|ico|gif|woff2|woff|swf)\b", "", u)
    # Remove everything up to and including a facebook/twitter host prefix,
    # and a trailing "/", "/?", "/#" or "#", to prepare for de-duplication.
    u = re.sub(r".*?\.(facebook|twitter).(net|com)(/)|(/|/\?|/#|#)$", "", u,
               flags=re.M)
    u = re.sub(r"\n\n", "\n", u)  # collapse lines emptied by the step above
    return io.StringIO(u).readlines()


def main_func(original_lines, js_files_2nd_lvl, all_endpoints_1st_lvl, timeout=15):
    """Download each first-party JS file and collect the paths quoted inside it.

    A URL is first-party when it contains the module-level ``domain_name``;
    any other URL is only reported on stdout. Each first-party file that
    answers 200 is downloaded, the double-quoted strings starting with ``/``
    are extracted from it, prefixed with the leading part of the file's URL
    and sorted into the two output lists.

    Args:
        original_lines: Iterable of JS file URLs to inspect.
        js_files_2nd_lvl: List extended in place with the extracted lines
            that end in ``.js``.
        all_endpoints_1st_lvl: List extended in place with all other
            extracted lines.
        timeout: Seconds to wait for each HTTP request before giving up on
            that URL.

    Returns:
        None. Results are delivered through the two list arguments.
    """
    # domain_name without its first lowercase label, e.g.
    # "www.example.com" -> "example.com" and "example.com" -> "com".
    tld = re.escape(re.sub(r"^[a-z]+\.", "", domain_name))
    # Requests are sent with verify=False; silence the warning this causes.
    warnings.simplefilter('ignore', InsecureRequestWarning)

    for line in original_lines:
        if str(domain_name) not in str(line):  ## excluding 3rd party js files & print 'em
            print("3rd party JS file has been found: " + line)
            continue

        # Shortest prefix of the URL that ends with tld, or "[]" if none.
        # NOTE(review): a host that contains tld earlier (e.g. "community."
        # with tld "com") is cut too short - confirm and consider urlsplit.
        prefix_match = re.findall("^(.*?)(?<=%s)" % tld, line)
        clear_url = prefix_match[0] if prefix_match else "[]"

        js_text = _fetch_js(line, timeout)
        if js_text is None:
            continue

        for one in _quoted_paths(js_text, clear_url):
            if re.findall(r"\.js$", one):
                # NOTE(review): every line already starts with clear_url, so
                # this check looks unreachable - confirm the intent.
                if one.startswith("//"):
                    print("3rd party JS file has been found: " + one)
                js_files_2nd_lvl.append(one)
            else:
                all_endpoints_1st_lvl.append(one)
# First pass: inspect the JS files whose URLs were read from stdin.
main_func(original_lines, js_files_2nd_lvl, all_endpoints_1st_lvl)

# Keep only the 1st-level entries that look like real endpoints.
temp = []
for endpoint in all_endpoints_1st_lvl:
    # Everything between the scheme and the last "/" of the entry. The "s" is
    # optional so that plain-http entries are no longer dropped wholesale.
    found = re.search(r"https?://(.*)(?=/)", endpoint)
    clear_domain = found.group(1) if found else ""
    # Escaped because it is interpolated into the patterns below.
    prefix = re.escape(clear_domain)

    endpoint = re.sub(r"(/|/\?|/#|#|/\.)$", "", endpoint)  ## additionally deleting / /? /#
    if re.search(prefix + "$", endpoint):
        continue  # nothing left after the prefix: bare URL without endpoint
    if "[]//" in endpoint:
        endpoint = endpoint.replace("[]//", "//%s" % clear_domain)
        temp.append(endpoint)
    if re.search(prefix + r"/\W", endpoint):
        continue  # last segment starts with a non-word character
    if re.search(prefix + "/[a-z0-9]{1,2}$", endpoint):
        continue  # 1-2 character segment: most likely a JS variable
    temp.append(endpoint)  # most likely an endpoint

all_endpoints_1st_lvl.clear()  ## deleting current list w/ endpoints
all_endpoints_1st_lvl = temp   ## substitution

all_endpoints_original = []    ## deleting dupes
deduplication(all_endpoints_1st_lvl, all_endpoints_original)
for endpoint in all_endpoints_original:
    # "[]" marks an entry whose URL prefix could not be determined.
    if "[]" not in endpoint:
        print(endpoint)
js_files_2nd_lvl_original = []
if js_files_2nd_lvl:
    print("\nJS files 2nd level:\n")
    deduplication(js_files_2nd_lvl, js_files_2nd_lvl_original)  ## removing dupes
    for js_url in js_files_2nd_lvl_original:
        # De-duplication may have cut a "?v=" or "#" suffix, so re-check the
        # extension. Entries are already stripped, so endswith is sufficient.
        if js_url.endswith(".js"):
            print(js_url)

    # Second pass: inspect the JS files discovered by the first pass.
    main_func(js_files_2nd_lvl_original, js_files_3rd_lvl, all_endpoints_2nd_lvl)

    if js_files_3rd_lvl:
        print("JS files 3rd level:\n")
        js_files_3rd_lvl_original = []
        deduplication(js_files_3rd_lvl, js_files_3rd_lvl_original)  ## removing dupes
        for js_url in js_files_3rd_lvl_original:
            if js_url.endswith(".js"):
                print(js_url)

    if all_endpoints_2nd_lvl:
        print("Endpoints 2nd level:\n")
        all_endpoints_2nd_lvl_original = []  ## deleting dupes
        deduplication(all_endpoints_2nd_lvl, all_endpoints_2nd_lvl_original)
        for endpoint in all_endpoints_2nd_lvl_original:
            # "[]" marks an entry whose URL prefix could not be determined.
            if "[]" not in endpoint:
                print(endpoint)
## retire js check performing using os.system
#if os.path.exists(directory_with_js_files) is True:
#os.system("retire %s" % directory_with_js_files)
## Deleting duplicates from the js files 2nd level
## Deleting duplicates from the endpoints 1st level
## Deleting duplicates from the js files 3rd level
## Deleting duplicates from the endpoints 2nd level