-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.js
51 lines (38 loc) · 1.05 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
/**
 * Reduces a URL to a canonical `host/path` key so that different spellings
 * of the same page compare equal.
 *
 * Only the hostname and pathname are kept (both lower-cased); the protocol,
 * port, query string and fragment are discarded. A single trailing slash is
 * removed from the result.
 *
 * @param {string} urlString - Absolute URL to normalize.
 * @returns {string} Lower-cased `hostname + pathname` without a trailing slash.
 * @throws {TypeError} If `urlString` cannot be parsed by the `URL` constructor.
 */
function normalizeUrl(urlString) {
  const { hostname, pathname } = new URL(urlString);
  const hostAndPath = `${hostname.toLowerCase()}${pathname.toLowerCase()}`;

  // Drop exactly one trailing slash so "host/path/" and "host/path" match.
  return hostAndPath.endsWith("/") ? hostAndPath.slice(0, -1) : hostAndPath;
}
/**
 * Collects the targets of all `<a>` elements in an HTML document as
 * absolute URL strings.
 *
 * Each anchor's raw `href` attribute is resolved against `baseUrl` with the
 * WHATWG `URL` constructor, so root-relative (`/a`), path-relative (`a.html`,
 * `../a`), fragment-only (`#x`) and already-absolute links are all handled
 * by the same code path. Anchors with a missing or empty `href` are skipped.
 * Links that cannot be resolved are logged and skipped; they never abort
 * the scan.
 *
 * @param {string} htmlBody - HTML source to scan.
 * @param {string} baseUrl - Absolute URL that relative links are resolved against.
 * @returns {string[]} Absolute URLs in document order (duplicates are kept).
 */
function getURLFromHtml(htmlBody, baseUrl) {
  const urls = [];
  const dom = new JSDOM(htmlBody);
  const linkElements = dom.window.document.querySelectorAll("a");

  for (const linkElement of linkElements) {
    // Use the raw attribute instead of the `href` property: the property is
    // resolved by the DOM against the document's own URL, not `baseUrl`.
    const rawHref = linkElement.getAttribute("href");
    if (!rawHref) {
      continue;
    }

    try {
      // `new URL(input, base)` performs proper relative resolution; string
      // concatenation breaks when the base ends in "/" or has a path.
      urls.push(new URL(rawHref, baseUrl).href);
    } catch (err) {
      console.log(`error resolving url "${rawHref}": ${err.message}`);
    }
  }

  return urls;
}
// Public API of this module: the URL normalizer and the HTML link extractor.
module.exports = { normalizeUrl, getURLFromHtml }