// crawler.js (forked from franzenzenhofer/lighthouse-script)
import axios from 'axios';
import * as cheerio from 'cheerio';

// Fetch the response body for a URL as a string.
async function fetchHTML(url) {
  const { data: html } = await axios.get(url);
  return html;
}
// Fetch /sitemap.xml relative to the given URL and collect every <loc>
// entry, covering both page entries (<url>) and sitemap-index entries
// (<sitemap>).
async function fetchSitemapLinks(url) {
  const sitemapUrl = new URL('/sitemap.xml', url).toString();
  const xml = await fetchHTML(sitemapUrl);
  const $ = cheerio.load(xml, { xmlMode: true });
  const urls = [];
  $('url > loc, sitemap > loc').each((_, el) => {
    urls.push($(el).text());
  });
  return urls;
}
// Fetch a page and collect every anchor href, resolved against the page
// URL. Hrefs that fail to parse are skipped so a single malformed link
// cannot abort the crawl.
async function fetchHTMLLinks(url) {
  const html = await fetchHTML(url);
  const $ = cheerio.load(html);
  const urls = [];
  $('a[href]').each((_, el) => {
    const link = $(el).attr('href');
    try {
      urls.push(new URL(link, url).toString());
    } catch {
      // Ignore unparseable hrefs.
    }
  });
  return urls;
}
// Resolve a URL against a base and strip the hash fragment, since
// fragments never change the fetched document.
function makeURLAbsolute(url, baseUrl) {
  const absoluteUrl = new URL(url, baseUrl);
  absoluteUrl.hash = '';
  return absoluteUrl.toString();
}
// Deduplicate while preserving insertion order.
function uniqueArray(array) {
  return Array.from(new Set(array));
}
// Keep only URLs whose hostname exactly matches the root URL's hostname.
// Note that this is stricter than "same root domain": subdomains of the
// root are excluded.
function filterSameRootDomainUrls(urls, rootUrl) {
  const rootHostname = new URL(rootUrl).hostname;
  return urls.filter((url) => new URL(url).hostname === rootHostname);
}
// Helper function to remove common tracking parameters from URLs.
function removeTrackingParameters(url) {
  const parsedUrl = new URL(url);
  const excludeParams = [
    'utm_source',
    'utm_medium',
    'utm_campaign',
    'utm_term',
    'utm_content',
    'fbclid',
    'gclid',
  ];
  excludeParams.forEach((param) => parsedUrl.searchParams.delete(param));
  return parsedUrl.toString();
}
// Shared post-processing pipeline: same-host filtering, tracking-parameter
// removal, hash stripping, deduplication, and a shortest-URL-first sort.
function normalizeUrls(urls, rootUrl) {
  const filteredUrls = filterSameRootDomainUrls(urls, rootUrl);
  const cleanedUrls = filteredUrls.map(removeTrackingParameters);
  const absoluteUrls = cleanedUrls.map((link) => makeURLAbsolute(link, rootUrl));
  return uniqueArray(absoluteUrls).sort((a, b) => a.length - b.length);
}

// Crawl a site: prefer its sitemap, and fall back to the links on the
// start page when the sitemap is empty or unreachable.
async function crawl(url) {
  try {
    const sitemapUrls = await fetchSitemapLinks(url);
    if (sitemapUrls.length > 0) {
      return normalizeUrls(sitemapUrls, url);
    }
    console.log('No URLs found in the sitemap.');
  } catch (error) {
    console.log('Error fetching sitemap:', error.message);
  }
  console.log('Falling back to crawling HTML links...');
  const htmlLinks = await fetchHTMLLinks(url);
  return normalizeUrls(htmlLinks, url);
}
export default crawl;
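
// A minimal usage sketch, not part of the original module: it assumes an
// ESM project with axios and cheerio installed and a reachable target site.
//
//   import crawl from './crawler.js';
//
//   const urls = await crawl('https://example.com');
//   console.log(`Discovered ${urls.length} URLs`);
//   urls.forEach((u) => console.log(u));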