Skip to content

Commit

Permalink
Updated jQuery to 1.8.3, fix+test always absolute a.href
Browse files Browse the repository at this point in the history
  • Loading branch information
sylvinus committed Dec 8, 2012
1 parent 8821db2 commit 2c45fb4
Show file tree
Hide file tree
Showing 11 changed files with 146 additions and 23 deletions.
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Crash course
"callback":function(error,result,$) {

// $ is a jQuery instance scoped to the server-side DOM of the page
$("#content a:link").each(function(a) {
$("#content a").each(function(a) {
c.queue(a.href);
});
}
Expand Down Expand Up @@ -132,6 +132,13 @@ Rough todolist
ChangeLog
---------

0.2.2
- Fixed relative-link bug: all a.href values are now absolute when crawling a remote URL
- Updated default jQuery to 1.8.3

0.2.1
- Updated jsdom to 0.2.19

0.2.0
- Updated code & dependencies for node 0.6/0.8, cleaned package.json
- Added a forceUTF8 mode
Expand Down
63 changes: 45 additions & 18 deletions lib/crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ var http = require('http'),
jschardet = require('jschardet'),
Iconv = require('iconv').Iconv,
jsdom = require('jsdom'),
fs = require("fs"),
Pool = require('generic-pool').Pool;


Expand All @@ -18,10 +19,10 @@ exports.Crawler = function(options) {
self.options = _.extend({
timeout: 60000,
jQuery: true,
jQueryUrl: path.resolve(__dirname,"../vendor/jquery-1.8.1.min.js"),
jQueryUrl: path.resolve(__dirname,"../vendor/jquery-1.8.3.min.js"),
maxConnections: 10,
priorityRange: 10,
priority: 5,
priority: 5,
retries: 3,
forceUTF8: false,
retryTimeout: 10000,
Expand Down Expand Up @@ -65,7 +66,7 @@ exports.Crawler = function(options) {
if (queuedCount+plannedQueueCallsCount === 0) {
if (self.options.onDrain) self.options.onDrain();
}
}
};

self.onDrain = function() {};

Expand All @@ -86,7 +87,7 @@ exports.Crawler = function(options) {
//If a query has already been made to self URL, don't callback again
if (cacheData) {

// Make sure we actually have cached data, and not just a note
// Make sure we actually have cached data, and not just a note
// that the page was already crawled
if (_.isArray(cacheData)) {
self.onContent(null,opts,cacheData[0],true);
Expand Down Expand Up @@ -179,7 +180,7 @@ exports.Crawler = function(options) {
self.cache[toQueue.uri] = [response];

//If we don't cache but still want to skip duplicates we have to maintain a list of fetched URLs.
} else if (toQueue.skipDuplicates) {
} else if (toQueue.skipDuplicates) {
self.cache[toQueue.uri] = true;
}
}
Expand All @@ -190,19 +191,45 @@ exports.Crawler = function(options) {

if (toQueue.jQuery && toQueue.method!="HEAD") {

// TODO support for non-HTML content
// TODO support for non-HTML content
// https://github.com/joshfire/node-crawler/issues/9
try {
jsdom.env(response.body,[toQueue.jQueryUrl],function(errors,window) {
if (errors) {
toQueue.callback(errors);
} else {
response.window = window;
toQueue.callback(null,response,window.jQuery);
}

release(toQueue);
});
// Build a jsdom environment for the fetched page and hand a
// jQuery-scoped window to the caller's callback.
// src: array of script source strings to inject (here: jQuery).
// "url" is set to the crawled URI so that relative links in the
// document resolve to absolute a.href values (the fix this commit tests).
var jsd = function(src) {
    jsdom.env({
        "url":toQueue.uri,
        "html":response.body,
        "src":src,
        "done":function(errors,window) {

            if (errors) {
                toQueue.callback(errors);
            } else {
                response.window = window;
                toQueue.callback(null,response,window.jQuery);
            }

            // Free the pool slot whether or not jsdom succeeded.
            release(toQueue);
        }
    });
};

// jsdom doesn't support adding local scripts,
// We have to read jQuery from the local fs
if (toQueue.jQueryUrl.match(/^(file\:\/\/|\/)/)) {

    // TODO cache this
    fs.readFile(toQueue.jQueryUrl.replace(/^file\:\/\//,""),"utf-8",function(err,jq) {
        if (err) {
            // BUGFIX: was `toQueue.callback(e)`. `e` is not in scope here —
            // the surrounding `catch (e)` binding exists only inside the catch
            // block, and this async callback runs after the try has exited —
            // so a read failure would throw a ReferenceError instead of being
            // reported. Pass the actual fs error to the caller instead.
            toQueue.callback(err);
            release(toQueue);
        } else {
            jsd([jq]);
        }
    });
} else {
    // Remote script URL: jsdom can fetch it itself.
    jsd([toQueue.jQueryUrl]);
}

} catch (e) {
toQueue.callback(e);
release(toQueue);
Expand All @@ -212,7 +239,7 @@ exports.Crawler = function(options) {

toQueue.callback(null,response);
release(toQueue);
}
}

};

Expand Down Expand Up @@ -280,7 +307,7 @@ exports.Crawler = function(options) {
}

},toQueue.priority);
}
};

};

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "crawler",
"version": "0.2.1",
"version": "0.2.2",
"description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously.",
"keywords": [
"dom",
Expand Down
7 changes: 7 additions & 0 deletions test/mockfiles/links1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<html>
<body>
    <!-- Test fixture for test/units/links.js: one relative and one
         site-absolute link. When crawled via the mock server, both
         must resolve to http://127.0.0.1:<port>/mockfiles/links2.html -->
    <a href="links2.html">Relative link</a>

    <a href="/mockfiles/links2.html">Absolute link</a>
</body>
</html>
5 changes: 5 additions & 0 deletions test/mockfiles/links2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<html>
<body>
    <!-- Test fixture: target page of links1.html; links back with a
         relative href so the two pages reference each other. -->
    <a href="links1.html">Relative link</a>
</body>
</html>
12 changes: 11 additions & 1 deletion test/mockserver.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,14 @@ app.get('/close/destroy', function(req, res){
res.end();
});

exports.app = app;

// Serve static test fixtures (links1.html, links2.html) to the crawler tests.
// NOTE(review): req.param(0) is interpolated into a filesystem path without
// sanitization — a path-traversal risk in general, though presumably
// acceptable for this localhost-only mock server; confirm it is never
// exposed beyond tests.
app.get('/mockfiles/*', function(req, res){
    res.sendfile("test/mockfiles/"+req.param(0));
});


exports.app = app;

// Allow running the mock server standalone: `node test/mockserver.js`.
// When required by the test runner, the runner controls listening instead.
if (require.main === module) {
    app.listen(8080);
}
23 changes: 23 additions & 0 deletions test/npm-debug.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
0 info it worked if it ends with ok
1 verbose cli [ 'node', '/usr/local/bin/npm', 'test' ]
2 info using [email protected]
3 info using [email protected]
4 verbose node symlink /usr/local/bin/node
5 verbose config file /Users/sylvinus/.npmrc
6 verbose config file /usr/local/etc/npmrc
7 verbose config file /usr/local/lib/node_modules/npm/npmrc
8 verbose read json /Users/sylvinus/w/sz/node-crawler/test/package.json
9 error Error: ENOENT, open '/Users/sylvinus/w/sz/node-crawler/test/package.json'
10 error If you need help, you may report this log at:
10 error <http://github.com/isaacs/npm/issues>
10 error or email it to:
10 error <[email protected]>
11 error System Darwin 12.2.0
12 error command "node" "/usr/local/bin/npm" "test"
13 error cwd /Users/sylvinus/w/sz/node-crawler/test
14 error node -v v0.8.4
15 error npm -v 1.1.45
16 error path /Users/sylvinus/w/sz/node-crawler/test/package.json
17 error code ENOENT
18 error errno 34
19 verbose exit [ 34, true ]
3 changes: 3 additions & 0 deletions test/testrunner.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ testrunner.run([
{
code: path + "/lib/crawler.js",
tests: [
path + "/test/units/links.js",

path + "/test/units/forceutf8.js",
path + "/test/units/simple.js",
path + "/test/units/errors.js"

]
}
],function() {
Expand Down
41 changes: 41 additions & 0 deletions test/units/links.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Integration test: verifies that a.href values read from a crawled page's
// DOM resolve to absolute URLs (jsdom is given the page URL as base).
// Requires the mock server from test/mockserver.js on MOCKPORT.
var Crawler = require("../../lib/crawler").Crawler;
var _ = require("underscore");

QUnit.module("links");

var DEBUG = true;
var MOCKPORT = 30045; // port the test harness starts the mock server on


test("links resolve to absolute urls", function() {
    expect( 2 );

    // QUnit async test: pause until onDrain fires start().
    stop();

    var c = new Crawler({
        "debug":DEBUG,
        "timeout":500,
        "retryTimeout":1000,
        "retries":1,
        "onDrain":function() {
            start();
        }
    });

    c.queue([{
        "uri":"http://127.0.0.1:"+MOCKPORT+"/mockfiles/links1.html",
        "callback":function(error,result,$) {

            // Collect href as seen by the DOM; resolution happens in jsdom.
            var links = _.map($("a"),function(a) {
                return a.href;
            });

            //Both links should resolve to absolute URLs
            equal(links[0],"http://127.0.0.1:30045/mockfiles/links2.html");
            equal(links[1],"http://127.0.0.1:30045/mockfiles/links2.html");

        }
    }]);


});
2 changes: 0 additions & 2 deletions vendor/jquery-1.8.1.min.js

This file was deleted.

2 changes: 2 additions & 0 deletions vendor/jquery-1.8.3.min.js

Large diffs are not rendered by default.

0 comments on commit 2c45fb4

Please sign in to comment.