Commit 5df2933

minor refactoring

kayslay committed Mar 12, 2019
1 parent c251a6a commit 5df2933
Showing 2 changed files with 27 additions and 30 deletions.
55 changes: 26 additions & 29 deletions module/crawl.js
@@ -5,46 +5,32 @@ const _ = require("lodash")
 const crawlUrls = require('./crawlUrls')
 
 /**
- * @description creates a crawler from a given config
+ * @description creates a new instance of a crawler from the config parameter passed
  * @param {Object} config the configuration object
  */
 function createCrawler(config = {}) {
     const {
         initCrawl
     } = crawlUrls()
 
     let nextLinks = [];
     let gen;
 
-    let urls, finalFn, depthFn, depth, limitNextLinks, nextCrawlWait;
-
-    //
-    function defaultDepthFn(data) {
-        // console.log("---depthFn called---")
-    }
-
-    function defaultFinalFn(err) {
-        if (err) throw err
-        return
-    }
-
-    //immediately configure the crawl
-    (function (config = {}) {
-        ({
-            urls = [],
+    let {
+        urls = [],
         finalFn = defaultFinalFn,
         depthFn = defaultDepthFn,
         depth = 1,
         limitNextLinks,
         nextCrawlWait = 0, //rate limit in what
-        } = config);
-        nextLinks = nextLinks.concat(urls);
-    })(config);
+    } = config
+    nextLinks = urls;
 
     /**
-     * @description crawls a single depth level
+     * @description crawls a complete step for a list of links. it crawls a single depth,
+     * then yields control to depthCrawl when it's done. calls depthFn if the step was successful
      */
-    function crawl() {
+    function singleDepthCrawl() {
 
         initCrawl(nextLinks, config)
             .then(scrapedData => {
@@ -60,13 +46,14 @@ function createCrawler(config = {}) {
     }
 
     /**
-     * @description generator that handles each depth level of the crawl
-     * @param {function(Object)} resolve
-     * @param {function(Object)} reject
+     * @description crawls until the depth is reached, an error occurs, or there are no
+     * more links to crawl.
+     * @param {function(Object)} resolve called when the crawl completes successfully
+     * @param {function(Object)} reject called when an error occurs
      */
-    function* crawlGen(resolve, reject) {
+    function* depthCrawl(resolve, reject) {
         for (let i = 0; i < depth; i++) {
-            nextLinks = yield crawl();
+            nextLinks = yield singleDepthCrawl();
             if (nextLinks.err) {
                 reject(nextLinks.err);
                 break;
@@ -95,7 +82,7 @@ function createCrawler(config = {}) {
      */
     function CrawlAllUrl() {
         return new Promise((resolve, reject) => {
-            gen = crawlGen(resolve, reject);
+            gen = depthCrawl(resolve, reject);
             gen.next();
             return gen
         }).then(() => finalFn())
@@ -107,4 +94,14 @@ function createCrawler(config = {}) {
     }
 
 }
-module.exports = createCrawler;
\ No newline at end of file
+module.exports = createCrawler;
+
+//
+function defaultDepthFn(data) {
+    // console.log("---depthFn called---")
+}
+
+function defaultFinalFn(err) {
+    if (err) throw err
+    return
+}
2 changes: 1 addition & 1 deletion module/crawlUrls.js
@@ -47,7 +47,7 @@ module.exports = function () {
 
     for (let url of urls) {
         const visitedUrlString = genUniqueVisitedString(url)
-        if (visitedLinks.indexOf(visitedUrlString) === -1) { //Todo: improve the visitedLinks check
+        if (visitedLinks.indexOf(visitedUrlString) === -1) { //TODO: improve the visitedLinks check
            visitedUrls++;
            if (rateLimit) {
                await new Promise((resolve, reject) => setTimeout(args => {
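The TODO presumably refers to the linear indexOf scan over an ever-growing visitedLinks array. One possible direction, an assumption and not anything in this commit, is a Set, which makes each membership test O(1):

    // Hypothetical follow-up to the TODO above (not part of this commit):
    // track visited URLs in a Set instead of scanning an array with indexOf.
    const genUniqueVisitedString = url => url.replace(/\/+$/, ""); // stand-in for the module's own normalizer

    const visitedLinks = new Set();

    function shouldVisit(url) {
        const key = genUniqueVisitedString(url);
        if (visitedLinks.has(key)) return false; // already crawled
        visitedLinks.add(key);                   // mark as visited
        return true;
    }

    // usage: shouldVisit("https://example.com/") is true once, false afterwards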
