Skip to content

Commit

Permalink
add skipDuplicates and made minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
kayslay committed Jun 9, 2019
1 parent aceec21 commit 0843b49
Showing 1 changed file with 47 additions and 69 deletions.
116 changes: 47 additions & 69 deletions module/crawlUrls.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,10 @@ module.exports = function () {
//The configuration variables. They would be set by the initCrawl function
let fetchSelector, fetchSelectBy, nextSelector, nextSelectBy, formatUrl, timeOut = false,
groups, _groupSet = {},
skipDuplicates = true,
rateLimit,
//set all defaultDynamicSchemas props when the variable reference is undefined
defaultDynamicSchemas = {
fetchSelector: undefined,
fetchSelectBy: undefined,
nextSelector: undefined,
nextSelectBy: undefined
},
//set all _dynamicSchemas props when the variable reference is undefined
_dynamicSchemas = {},
fetchFn, nextFn;

/**
Expand All @@ -47,15 +43,35 @@ module.exports = function () {

for (let url of urls) {
const visitedUrlString = genUniqueVisitedString(url)
if (visitedLinks.indexOf(visitedUrlString) === -1) { //TODO: improve the visitedLinks check
if (!skipDuplicates || visitedLinks.indexOf(visitedUrlString) === -1) {
visitedUrls++;
if (rateLimit) {
if (rateLimit) { //checks if ratelimit is set, and wait for the rateLimit before continuing
await new Promise((resolve, reject) => setTimeout(args => {
resolve(null)
}, rateLimit))
}
visitedLinks.push(visitedUrlString);
req(url);
// make request
request(url, function (err, response, body) {
visitedUrls--;
if (err) {
console.error(`${(new Date())} ERROR ${err.message}`);
} else {
getDomContents = dom(body).getDomContents; //
scrapedData.push(fetchFromPage(url));
let newLink = _.uniq(util.sortDataToArray([selectNextCrawlContent(url)]).map(url => {
return formatUrl(url)
}));
initialLink = initialLink.concat(newLink);
}

if (visitedUrls == 0) {
resolve({
fetchedData: scrapedData,
nextLinks: initialLink
})
}
});
} else {
console.info(`${(new Date())} INFO ${visitedUrlString} has been visited`)
}
Expand All @@ -65,41 +81,6 @@ module.exports = function () {
reject(new AllLinksVisitErr())
}

/**
* @description decrement the visitedLink count, makes request to the url passed to it
* get the response of the request and extract the data needed from the response body.
* the data extracted is appended to scrapedData. when the visitedLink count is 0 the promise is resolved
* @param url
* @private
*/

function req(url) {

request(url, function (err, response, body) {
visitedUrls--;
if (err) {
//todo: context kill
console.error(`${(new Date())} ERROR ${err.message}`);
} else {
//Todo: context kill
getDomContents = dom(body).getDomContents; //
scrapedData.push(fetchFromPage(url));
let newLink = _.uniq(util.sortDataToArray([selectNextCrawlContent(url)]).map(url => {
return formatUrl(url)
}));
initialLink = initialLink.concat(newLink);
}

if (visitedUrls == 0) {
resolve({
fetchedData: scrapedData,
nextLinks: initialLink
})
}
});

}

}


Expand All @@ -108,8 +89,8 @@ module.exports = function () {
* @param {String|Object} url
*/
function fetchFromPage(url) {
let selector = util.dynamicSelection(url, defaultDynamicSchemas.fetchSelector, fetchSelector);
let selectBy = util.dynamicSelection(url, defaultDynamicSchemas.fetchSelectBy, fetchSelectBy);
let selector = util.dynamicSelection(url, _dynamicSchemas.fetchSelector, fetchSelector);
let selectBy = util.dynamicSelection(url, _dynamicSchemas.fetchSelectBy, fetchSelectBy);

return getDomContents(selector, selectBy, fetchFn, url, {
_groupSet,
Expand All @@ -121,8 +102,8 @@ module.exports = function () {
* @param {String|Object} url
*/
function selectNextCrawlContent(url) {
let selector = util.dynamicSelection(url, defaultDynamicSchemas.nextSelector, nextSelector);
let selectBy = util.dynamicSelection(url, defaultDynamicSchemas.nextSelectBy, nextSelectBy);
let selector = util.dynamicSelection(url, _dynamicSchemas.nextSelector, nextSelector);
let selectBy = util.dynamicSelection(url, _dynamicSchemas.nextSelectBy, nextSelectBy);
return getDomContents(selector, selectBy, nextFn, url);
}

Expand Down Expand Up @@ -156,29 +137,26 @@ module.exports = function () {
* @param config
* @return {Promise}
*/
function initCrawl(urls, config) {
function initCrawl(urls, config = {}) {

if (!configured) {
let dynamicSchemas; // define the variable to hold the dynamic data
(function (config = {}) {

({
fetchSelector,
fetchSelectBy,
nextSelector = {},
nextSelectBy = {},
fetchFn,
nextFn,
timeOut,
groups = {},
rateLimit = 0,
dynamicSchemas = {},
formatUrl = util.formatUrl
} = config);
configSelectors()
Object.assign(defaultDynamicSchemas, dynamicSchemas)

})(config);
({
fetchSelector,
fetchSelectBy,
nextSelector = {},
nextSelectBy = {},
fetchFn,
nextFn,
timeOut,
groups = {},
rateLimit = 0,
dynamicSchemas = {},
formatUrl = util.formatUrl,
skipDuplicates
} = config);
configSelectors()
Object.assign(_dynamicSchemas, dynamicSchemas)
configured = true
}

Expand Down

0 comments on commit 0843b49

Please sign in to comment.