Skip to content

Commit

Permalink
test url
Browse files Browse the repository at this point in the history
  • Loading branch information
dlutwuwei committed Nov 24, 2016
1 parent cf62780 commit 2ae3f40
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 3 deletions.
25 changes: 23 additions & 2 deletions lib/crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ exports.request = function (url, options, strategy) {

//spider function to crawl urls from the website
$.spider = function () {

$('a').each(function (index, value) {
var url = $(this).attr('href');
if (url) url = url.replace("./", "/");
Expand All @@ -89,7 +88,9 @@ exports.request = function (url, options, strategy) {
if (!/^https?:/.test(url)) {
url = host + (url[0] == '/' ? url.substr(1, url.length) : url);
}

if(!testUrl(url)) {
return;
}
var route = router.match(url);
if (route != null) {
route.fn.apply(null, [err, $, body, url, resp]);
Expand Down Expand Up @@ -139,5 +140,25 @@ exports.ignore = function (pattern) {
});
}

/**
 * Check whether a string looks like a well-formed URL.
 *
 * Accepted shape: [scheme://][user[:password]@]host[:port][/path...]
 * where host is either a dotted IPv4 address or a domain name ending in a
 * 2-6 letter top-level domain. Matching is case-insensitive.
 *
 * The pattern fragments must be joined with `+`: adjacent string literals on
 * separate lines are NOT concatenated in JavaScript (automatic semicolon
 * insertion ends the statement after the first literal), which would leave
 * only the scheme prefix as the effective pattern.
 *
 * @param {string} str_url - candidate URL
 * @returns {boolean} true when str_url matches the URL pattern, false
 *     otherwise (including for non-string input)
 */
function testUrl(str_url) {
    if (typeof str_url !== 'string') {
        return false;
    }
    // Backslashes are doubled because the pattern is built from string
    // literals; a single "\." in a string would collapse to "." (any char).
    var strRegex = "^((https|http|ftp|rtsp|mms)?://)?" // optional scheme
        + "(([0-9a-z_!~*'().&=+$%-]+:)?[0-9a-z_!~*'().&=+$%-]+@)?" // optional ftp-style user[:password]@
        + "(([0-9]{1,3}\\.){3}[0-9]{1,3}" // URL in IP form - 199.194.52.184
        + "|" // allow either an IP or a domain name
        + "([0-9a-z_!~*'()-]+\\.)*" // sub-domain labels - www.
        + "([0-9a-z][0-9a-z-]{0,61})?[0-9a-z]\\." // second-level domain
        + "[a-z]{2,6})" // first level domain - .com or .museum
        + "(:[0-9]{1,5})?" // port - :80 (up to 5 digits, e.g. :65535)
        + "((/?)|" // a slash isn't required if there is no file name
        + "(/[0-9a-z_!~*'().;?:@&=+$,%#-]+)+/?)$"; // one or more path segments
    // "i" flag: schemes and hosts are case-insensitive and paths commonly
    // contain uppercase characters.
    var re = new RegExp(strRegex, "i");
    return re.test(str_url);
}



2 changes: 1 addition & 1 deletion test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ crawler.addStrategy('*',function(err, $, body, url, resp){
//console.log(body);
console.log($("img").attr('src'));
});
crawler.request("http://c6.d5j.biz/htm_data/16/1611/2135730.html",{decode:"utf-8",deep:3,concurrency:3}, function(err, $, body,resp){
crawler.request("http://baidu.com/",{decode:"utf-8",deep:3,concurrency:3}, function(err, $, body,resp){
$.spider();
});

0 comments on commit 2ae3f40

Please sign in to comment.