-
Notifications
You must be signed in to change notification settings - Fork 463
/
Copy pathgenerate-custom-parser.js
executable file
·149 lines (128 loc) · 3.94 KB
/
generate-custom-parser.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
/* eslint-disable import/no-extraneous-dependencies */
/* eslint-disable no-use-before-define */
/* eslint-disable no-console */
import fs from 'fs';
import URL from 'url';
import inquirer from 'inquirer';
import ora from 'ora';
import { exec } from 'child_process';
import { stripJunkTags, makeLinksAbsolute } from 'utils/dom';
import Parser from '../dist/mercury';
import extractorTemplate from './templates/custom-extractor';
import extractorTestTemplate from './templates/custom-extractor-test';
// Interactive prompt definition, used when no URL is given on the command line.
const questions = [
  {
    type: 'input',
    name: 'website',
    message:
      "Paste a url to an article you'd like to create or extend a parser for:",
    // Accept the answer only when it parses to something with a hostname.
    validate: value => Boolean(URL.parse(value).hostname),
  },
];
// Spinner for the step currently in progress. Kept at module level because
// `savePage` marks it as succeeded once an asynchronous step has resolved.
let spinner;

/**
 * Runs `fn(...args)` behind a console spinner labelled `msg`.
 *
 * - If `fn` returns a thenable, the resolved value is forwarded to
 *   `savePage(value, args, newParser)`, which is responsible for stopping the
 *   spinner. A rejection, or an error raised while handling the resolved
 *   value, marks the spinner as failed, logs the error and sets a non-zero
 *   exit code instead of leaving the spinner running.
 * - Otherwise the step is considered finished and the spinner succeeds
 *   immediately.
 *
 * @param {Function} fn - Step to execute.
 * @param {Array} args - Arguments spread into `fn`.
 * @param {string} msg - Spinner text.
 * @param {boolean} [newParser] - Forwarded to `savePage` for async steps.
 * @returns {*} Whatever `fn` returned (the original promise for async steps).
 */
function confirm(fn, args, msg, newParser) {
  spinner = ora({ text: msg });
  spinner.start();

  const result = fn(...args);
  if (result && typeof result.then === 'function') {
    result
      .then(r => savePage(r, args, newParser))
      .catch(err => {
        // Without this handler a failed step would never stop the spinner.
        spinner.fail();
        console.error(err);
        process.exitCode = 1;
      });
  } else {
    spinner.succeed();
  }

  return result;
}
/**
 * Creates `dir` through `confirm` (so a spinner labelled `msg` is shown),
 * unless the path already exists.
 *
 * @param {string} dir - Directory path to create.
 * @param {string} msg - Spinner text shown while creating it.
 */
function confirmCreateDir(dir, msg) {
  const alreadyPresent = fs.existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  confirm(fs.mkdirSync, [dir], msg);
}
/**
 * Maps an article URL to the directory holding its custom extractor.
 *
 * @param {string} url - Article URL.
 * @returns {string} `./src/extractors/custom/<hostname>`.
 */
function getDir(url) {
  const parsed = URL.parse(url);
  return `./src/extractors/custom/${parsed.hostname}`;
}
/**
 * Entry point for scaffolding: prepares the directories for `url`'s hostname
 * and kicks off fetching the page as a fixture.
 *
 * A parser is treated as new when its extractor directory does not exist yet;
 * that flag is passed through `confirm` so the follow-up step knows whether to
 * generate extractor/test files.
 *
 * @param {string} url - URL of the article to build or extend a parser for.
 */
function scaffoldCustomParser(url) {
  const dir = getDir(url);
  const { hostname } = URL.parse(url);
  const newParser = !fs.existsSync(dir);

  if (newParser) {
    confirmCreateDir(dir, `Creating ${hostname} directory`);
  }
  // Ensure the fixtures directory even for an existing parser: the fetched
  // page is written there, and the write fails if the directory is missing.
  // `confirmCreateDir` is a no-op when the directory already exists.
  confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');

  confirm(Parser.fetchResource, [url], 'Fetching fixture', newParser);
}
// If an argument was passed, assume it is a URL and skip the prompt.
const [, , urlArg] = process.argv;
if (!urlArg) {
  inquirer.prompt(questions).then(({ website }) => {
    scaffoldCustomParser(website);
  });
} else {
  scaffoldCustomParser(urlArg);
}
/**
 * Writes the extractor and its test file for `url`'s hostname, registers the
 * extractor in the custom extractors index, and runs the lint fixer on the
 * generated files.
 *
 * @param {string} url - Article URL the parser is generated for.
 * @param {string} file - Path of the saved fixture, passed to the test template.
 * @param {*} result - Parse result, passed to the test template.
 */
function generateScaffold(url, file, result) {
  const { hostname } = URL.parse(url);
  const name = extractorName(hostname);
  const targetDir = getDir(url);

  const extractorSource = extractorTemplate(hostname, name);
  const testSource = extractorTestTemplate(file, url, targetDir, result, name);

  fs.writeFileSync(`${targetDir}/index.js`, extractorSource);
  fs.writeFileSync(`${targetDir}/index.test.js`, testSource);
  fs.appendFileSync('./src/extractors/custom/index.js', exportString(url));

  // Fire-and-forget: the lint fixer's outcome is not awaited or inspected.
  exec(`npm run lint-fix-quiet -- ${targetDir}/*.js`);
}
/**
 * Saves the fetched page as an HTML fixture, then parses it and either
 * scaffolds a new extractor or tells the user how to reuse the fixture.
 *
 * Called by `confirm` with the resolved value of an asynchronous step; it
 * marks the shared spinner as succeeded before doing any work.
 *
 * @param {Function} $ - DOM query function for the fetched page, invoked as
 *   `$(selector)` (presumably a cheerio instance - confirm against
 *   `Parser.fetchResource`).
 * @param {Array} args - Argument list of the originating step; only the first
 *   entry (the article URL) is used.
 * @param {boolean} newParser - Whether extractor and test files should be
 *   generated for this hostname.
 * @returns {Promise} Settles once parsing and the follow-up step are done, so
 *   callers can observe failures.
 */
function savePage($, [url], newParser) {
  const { hostname } = URL.parse(url);
  spinner.succeed();

  // Timestamp-based name keeps successive fixtures for one host apart.
  const filename = Date.now();
  const file = `./fixtures/${hostname}/${filename}.html`;

  // fix http(s) relative links:
  makeLinksAbsolute($('*').first(), $, url);
  $('[src], [href]').each((index, node) => {
    const $node = $(node);
    // Both attributes are selected above, so both must be rewritten;
    // previously protocol-relative `href` values were left untouched.
    ['src', 'href'].forEach(attr => {
      const link = $node.attr(attr);
      if (link && link.slice(0, 2) === '//') {
        $node.attr(attr, `http:${link}`);
      }
    });
  });

  // NOTE(review): presumably removes <script> elements - confirm in utils/dom.
  const html = stripJunkTags($('*').first(), $, ['script']).html();
  fs.writeFileSync(file, html);

  return Parser.parse(url, { html }).then(result => {
    if (newParser) {
      confirm(
        generateScaffold,
        [url, file, result],
        'Generating parser and tests'
      );
      console.log(`Your custom site extractor has been set up. To get started building it, run
yarn watch:test -- ${hostname}
-- OR --
npm run watch:test -- ${hostname}`);
    } else {
      console.log(`
It looks like you already have a custom parser for this url.
The page you linked to has been added to ${file}. Copy and paste
the following code to use that page in your tests:
const html = fs.readFileSync('${file}');`);
    }
  });
}
/**
 * Builds the re-export statement that registers a hostname's extractor in the
 * custom extractors index.
 *
 * @param {string} url - Article URL.
 * @returns {string} `export * from './<hostname>';`
 */
function exportString(url) {
  const parsed = URL.parse(url);
  return `export * from './${parsed.hostname}';`;
}
/**
 * Derives the extractor's name from a hostname by capitalising each label and
 * appending `Extractor`, e.g. `www.example.com` -> `WwwExampleComExtractor`.
 *
 * Hyphens are treated as separators as well as dots
 * (`ici.radio-canada.ca` -> `IciRadioCanadaCaExtractor`): the name is handed
 * to the extractor templates, presumably as a JavaScript identifier, and `-`
 * is not legal in one.
 *
 * @param {string} hostname - Hostname of the site being scaffolded.
 * @returns {string} PascalCase name ending in `Extractor`.
 */
function extractorName(hostname) {
  const name = hostname
    .split(/[.-]/)
    .map(w => `${w.charAt(0).toUpperCase()}${w.slice(1)}`)
    .join('');
  return `${name}Extractor`;
}