-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
86 lines (65 loc) · 2.42 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
const fs = require('fs')
const puppeteer = require('puppeteer')
// Root of the news archive; each year's index page lives at `${BASE_URL}<year>/`.
const BASE_URL = 'https://www.wofford.edu/about/news/news-archives/'

// Archive years to export. Each year gets its own browser and output directory.
const years = ['2019']

// Start every year concurrently. Each promise gets its own rejection handler so
// one failing year is reported (and reflected in the exit code) instead of being
// left as a floating promise.
years.forEach((year) => {
  scrape(year).catch((err) => {
    console.error(`Scrape failed for ${year}:`, err)
    process.exitCode = 1
  })
})
/**
 * Saves every news article linked from one year's archive page as a PDF.
 *
 * Opens `${BASE_URL}${year}/` in a headless browser, collects the unique links
 * on that page whose URL starts with the archive URL, and prints each linked
 * page to `<year>/<post date>-<last URL segment>.pdf` (A4, margin of 48 on
 * every side). Articles are rendered concurrently, one tab per article. A
 * failure on a single article is logged and does not stop the others.
 *
 * @param {string} year - Archive year; used both as the URL segment and as the
 *   name of the output directory (resolved against the working directory).
 * @returns {Promise<void>} Settles once every article has been attempted and
 *   the browser has been closed.
 * @throws If the browser cannot be launched, the archive page cannot be
 *   loaded, or the output directory cannot be created.
 */
async function scrape(year) {
  const SCRAPE_URL = `${BASE_URL}${year}/`
  const browser = await puppeteer.launch({ headless: true })

  // try/finally so the browser is shut down even when a step below throws.
  try {
    const [page] = await browser.pages()
    await page.goto(SCRAPE_URL, { waitUntil: 'networkidle0' })

    // The callback runs inside the page, so the archive URL must be passed in
    // explicitly rather than captured from this scope.
    const pageUrls = await page.evaluate((archiveUrl) => {
      const hrefs = Array.from(document.links, (link) => link.href)
      return [...new Set(hrefs.filter((href) => href.startsWith(archiveUrl)))]
    }, SCRAPE_URL)

    // Awaited so the directory is guaranteed to exist before any PDF is
    // written into it, and so a mkdir failure rejects this function.
    await fs.promises.mkdir(year, { recursive: true })

    const results = await Promise.allSettled(
      pageUrls.map(async (url) => {
        const articlePage = await browser.newPage()
        try {
          await articlePage.goto(url, { waitUntil: 'networkidle0' })
          const postDateEl = await articlePage.waitForSelector('.post-date')
          // NOTE(review): the date text is used verbatim in the file name;
          // confirm the site never renders it with path separators.
          const postDate = await postDateEl.evaluate((el) => el.textContent)
          const slug = url.substring(url.lastIndexOf('/') + 1)
          await articlePage.pdf({
            format: 'A4',
            margin: { top: 48, bottom: 48, left: 48, right: 48 },
            path: `${year}/${postDate}-${slug}.pdf`,
          })
        } finally {
          // Close the tab whether or not rendering succeeded.
          await articlePage.close()
        }
      })
    )

    // allSettled never rejects; report failures instead of dropping them.
    results.forEach((result, i) => {
      if (result.status === 'rejected') {
        console.error(`Failed to save ${pageUrls[i]} as PDF:`, result.reason)
      }
    })
  } finally {
    await browser.close()
  }
}
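// Disabled debugging helper: prints one hard-coded article (2019/gonzaleslecture)
// to PDF. To use it, uncomment this whole block, including the call on its last line.
// async function singleScrape() {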
// const browser = await puppeteer.launch({ headless: true })
// const page = await browser.newPage()
// const year = '2019'
// const fileName = 'gonzaleslecture'
// await page.goto(`https://www.wofford.edu/about/news/news-archives/${year}/${fileName}`, { waitUntil: 'networkidle0' })
// const postDateEl = await page.waitForSelector('.post-date')
// const postDate = await postDateEl.evaluate((el) => el.textContent)
// await page.pdf({
// format: 'A4',
// margin: {
// top: 48,
// bottom: 48,
// left: 48,
// right: 48,
// },
// path: `${year}/${postDate}-${fileName}.pdf`,
// })
// await page.close()
// await browser.close()
// }
// singleScrape()