-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathserver.js
98 lines (92 loc) · 2.63 KB
/
server.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
const express = require("express")
const robots = require('express-robots-txt');
const puppeteer = require("puppeteer");
const useragent = require('express-useragent');
// Origin of the site whose pages are rendered; fetchHTML appends the request URL to it.
const targetHost = "https://blog.nano71.com"
// Express application that hosts the middleware and the catch-all route below.
const app = express()
// Port handed to app.listen.
const port = 9001
// WebSocket endpoint of the launched browser: set in fetchHTML after a launch,
// reused there for puppeteer.connect, and cleared in closeBrowser.
let browserWSEndpoint
// Most recent Puppeteer browser handle obtained by fetchHTML (via launch or connect).
let browser
// Handle of the pending timeout scheduled by closeBrowser.
let timer
// robots.txt policy handed to express-robots-txt: every user agent is asked
// not to crawl stylesheet, script and GIF URLs; no sitemap is advertised.
const robotsPolicy = [
  {
    UserAgent: "*",
    Disallow: ["/*.css$", "/*.js$", "/*.gif$"],
    Sitemap: [],
  },
];
app.use(robots(robotsPolicy));

// express-useragent middleware; the catch-all route reads request.useragent.
const userAgentMiddleware = useragent.express();
app.use(userAgentMiddleware);
/**
 * Catch-all GET route.
 *
 * URLs that contain a dot are rejected with 400 "Bad Request". Every other URL
 * is rendered through fetchHTML and the resulting HTML is returned. If the
 * rendered page contains the "non-existent article" marker, an empty 404 is
 * sent instead. Rendering failures produce a 500 instead of leaving the
 * request hanging.
 */
app.get('*', async (request, response) => {
  const url = request.url;

  if (url.includes(".")) {
    console.log("skip", url);
    response.status(400).send("Bad Request");
    return;
  }

  // The visitor's user-agent string is reversed and wrapped in "bot(...)"
  // before being used as the user agent of the rendering page.
  const forwardedUserAgent = `bot(${request.useragent.source.split('').reverse().join("")})`;

  try {
    const html = await fetchHTML(url, forwardedUserAgent);
    response.contentType("text/html");
    if (html.includes("The article is non-existent.")) {
      response.status(404).send();
    } else {
      response.send(html);
    }
  } catch (error) {
    // Without this handler a rejected fetchHTML (timeout, launch failure, ...)
    // would leave the client waiting and surface as an unhandled rejection.
    console.error("render failed:", url, error);
    if (!response.headersSent) {
      response.status(500).send("Internal Server Error");
    }
  } finally {
    // Schedule the idle shutdown on failure too; skip it when no browser
    // handle exists yet (e.g. the very first launch failed).
    if (browser) {
      closeBrowser();
    }
  }
});
/**
 * Prints the start-up banner once the server is listening.
 */
function announceStartup() {
  console.log("\n");
  console.log("----------------------START----------------------\n");
  console.log(`Service listening on port ${port}`);
}

// Start the Express application, listening on the configured port.
app.listen(port, undefined, announceStartup);
/**
 * Fetches the rendered HTML of a page on the target host.
 *
 * Reuses the already-launched browser through its WebSocket endpoint when one
 * is recorded; otherwise (or when reconnecting fails) launches a new headless
 * browser and records its endpoint. The page opened for the request is always
 * closed, even when navigation fails.
 *
 * @param {string} url - Request URL (path and query) appended to targetHost.
 * @param {string} ua - User-agent string applied to the rendering page.
 * @returns {Promise<string>} Serialized HTML of the page after the network went idle.
 * @throws Rejects when the browser cannot be started or navigation fails or times out.
 */
async function fetchHTML(url, ua) {
  if (browserWSEndpoint) {
    try {
      // NOTE(review): every call opens a new connection that is not
      // disconnected here — confirm whether that is intended.
      browser = await puppeteer.connect({
        browserWSEndpoint,
      });
    } catch (error) {
      // The recorded endpoint is stale (e.g. the browser process died).
      // Forget it so a fresh browser is launched below instead of failing
      // on every subsequent request.
      console.error("puppeteer.connect failed, launching a new browser:", error);
      browserWSEndpoint = undefined;
    }
  }
  if (!browserWSEndpoint) {
    browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    browserWSEndpoint = browser.wsEndpoint();
  }

  console.log("fetchHTML:", url);
  const page = await browser.newPage();
  let html;
  try {
    await page.setRequestInterception(true);
    await page.setUserAgent(ua);
    page.setDefaultTimeout(60000);
    page.on('request', (request) => {
      const path = new URL(request.url()).pathname;
      if (path.includes(".") && !path.endsWith(".js")) {
        // Abort requests for files with an extension other than ".js".
        request.abort();
      } else {
        // Let documents, extension-less paths and scripts through.
        request.continue();
      }
    });
    console.log(1);
    await page.goto(targetHost + url, {
      waitUntil: 'networkidle0',
    });
    console.log(2);
    html = await page.content();
  } finally {
    // A page left open would keep closeBrowser's "only one page left" check
    // from ever succeeding, so close it on the failure path as well.
    try {
      await page.close();
    } catch (closeError) {
      // Do not let a close failure mask the original navigation error.
      console.error("page.close failed:", closeError);
    }
  }
  console.log("end");
  return html;
}
/**
 * Schedules a shutdown check for the shared browser.
 *
 * Each call cancels the previously scheduled check and arms a new one that
 * fires after 5 seconds. When it fires, the browser is closed only if exactly
 * one page is open; in that case the recorded WebSocket endpoint is cleared
 * first so the next fetch launches a new browser.
 */
function closeBrowser() {
  clearTimeout(timer);
  timer = setTimeout(async () => {
    // No browser handle was ever obtained, so there is nothing to close.
    if (!browser) {
      return;
    }
    try {
      const openPages = await browser.pages();
      if (openPages.length === 1) {
        browserWSEndpoint = undefined;
        await browser.close();
      }
    } catch (error) {
      // Errors thrown inside a timer callback have no caller to propagate to;
      // log them instead of letting them become unhandled rejections.
      console.error("closeBrowser failed:", error);
    }
  }, 5000);
}