Skip to content

Commit

Permalink
code review: convert static filtering's tokenizer to a global utility
Browse files Browse the repository at this point in the history
  • Loading branch information
gorhill committed Dec 29, 2015
1 parent 9d06ae1 commit a944873
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 98 deletions.
27 changes: 15 additions & 12 deletions .jshintrc
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
{
"browser": true,
"devel": true,
"eqeqeq": true,
"esnext": true,
"globals": {
"chrome": false,
"Components": false, // global variable in Firefox
"safari": false,
"self": false,
"vAPI": false,
"µBlock": false
},
"globalstrict": true,
"undef": true,
"unused": true,
"laxbreak": true,
"newcap": false,
"nonew": false,
"strict": "global",
"sub": true,
"laxbreak": true,
"undef": true,
"unused": true,
"validthis": true,
"newcap": false,
"-W058": true, // suppress "Missing '()' invoking a constructor" message
"globals": {
"self": false,
"vAPI": false,
"chrome": false,
"safari": false,
"Components": false // global variable in Firefox
}
"-W058": true // suppress "Missing '()' invoking a constructor" message
}
82 changes: 14 additions & 68 deletions src/js/static-net-filtering.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
Home: https://github.com/gorhill/uBlock
*/

/* jshint bitwise: false, esnext: true */
/* global punycode, µBlock */
/* jshint bitwise: false */
/* global punycode */

/******************************************************************************/

Expand Down Expand Up @@ -1657,19 +1657,10 @@ FilterParser.prototype.makeToken = function() {
/******************************************************************************/
/******************************************************************************/

var TokenEntry = function() {
this.beg = 0;
this.token = '';
};

/******************************************************************************/
/******************************************************************************/

var FilterContainer = function() {
this.reAnyToken = /[%0-9a-z]+/g;
this.reIsGeneric = /[\^\*]/;
this.tokens = [];
this.filterParser = new FilterParser();
this.urlTokenizer = µb.urlTokenizer;
this.reset();
};

Expand Down Expand Up @@ -2254,42 +2245,6 @@ FilterContainer.prototype.filterRegexFromCompiled = function(compiled, flags) {

/******************************************************************************/

// Since the addition of the `important` evaluation, this means it is now
// likely that the url will have to be scanned more than once. So this is
// to ensure we do it once only, and reuse results.

FilterContainer.prototype.tokenize = function(url) {
var tokens = this.tokens;
var re = this.reAnyToken;
var matches, tokenEntry;
re.lastIndex = 0;
var i = 0;
while ( (matches = re.exec(url)) ) {
tokenEntry = tokens[i];
if ( tokenEntry === undefined ) {
tokenEntry = tokens[i] = new TokenEntry();
}
tokenEntry.beg = matches.index;
tokenEntry.token = matches[0];
i += 1;

// https://github.com/chrisaljoudi/uBlock/issues/1118
// Crazy case... but I guess we have to expect the worst...
if ( i === 2048 ) {
break;
}
}

// Sentinel
tokenEntry = tokens[i];
if ( tokenEntry === undefined ) {
tokenEntry = tokens[i] = new TokenEntry();
}
tokenEntry.token = '';
};

/******************************************************************************/

FilterContainer.prototype.matchTokens = function(bucket, url) {
// Hostname-only filters
var f = bucket['.'];
Expand All @@ -2299,7 +2254,7 @@ FilterContainer.prototype.matchTokens = function(bucket, url) {
return true;
}

var tokens = this.tokens;
var tokens = this.urlTokenizer.getTokens();
var tokenEntry, token;
var i = 0;
for (;;) {
Expand Down Expand Up @@ -2336,22 +2291,20 @@ FilterContainer.prototype.matchTokens = function(bucket, url) {
// not the generic handling.

FilterContainer.prototype.matchStringExactType = function(context, requestURL, requestType) {
var url = requestURL.toLowerCase();

// These registers will be used by various filters
pageHostnameRegister = context.pageHostname || '';
requestHostnameRegister = µb.URI.hostnameFromURI(requestURL);

var party = isFirstParty(context.pageDomain, requestHostnameRegister) ? FirstParty : ThirdParty;

// Be prepared to support unknown types
var type = typeNameToTypeValue[requestType] || 0;
if ( type === 0 ) {
return undefined;
}

// Tokenize only once
this.tokenize(url);
// Prime tokenizer: we get a normalized URL in return.
var url = this.urlTokenizer.setURL(requestURL);

// These registers will be used by various filters
pageHostnameRegister = context.pageHostname || '';
requestHostnameRegister = µb.URI.hostnameFromURI(url);

var party = isFirstParty(context.pageDomain, requestHostnameRegister) ? FirstParty : ThirdParty;

this.fRegister = null;

Expand Down Expand Up @@ -2426,12 +2379,6 @@ FilterContainer.prototype.matchString = function(context) {
return this.matchStringExactType(context, context.requestURL, context.requestType);
}

// https://github.com/gorhill/httpswitchboard/issues/239
// Convert url to lower case:
// `match-case` option not supported, but then, I saw only one
// occurrence of it in all the supported lists (bulgaria list).
var url = context.requestURL.toLowerCase();

// The logic here is simple:
//
// block = !whitelisted && blacklisted
Expand All @@ -2453,14 +2400,13 @@ FilterContainer.prototype.matchString = function(context) {
// filters are tested *only* if there is a (unlikely) hit on a block
// filter.

// Prime tokenizer: we get a normalized URL in return.
var url = this.urlTokenizer.setURL(context.requestURL);

// These registers will be used by various filters
pageHostnameRegister = context.pageHostname || '';
requestHostnameRegister = context.requestHostname;

// Tokenize only once
this.tokenize(url);

this.fRegister = null;

var party = isFirstParty(context.pageDomain, context.requestHostname) ? FirstParty : ThirdParty;
Expand Down
2 changes: 1 addition & 1 deletion src/js/tab.js
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,7 @@ vAPI.tabs.registerListeners();
if ( pageStore !== null ) {
state = pageStore.getNetFilteringSwitch();
if ( state && this.userSettings.showIconBadge && pageStore.perLoadBlockedRequestCount ) {
badge = this.utils.formatCount(pageStore.perLoadBlockedRequestCount);
badge = this.formatCount(pageStore.perLoadBlockedRequestCount);
}
}

Expand Down
91 changes: 74 additions & 17 deletions src/js/utils.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*******************************************************************************
µBlock - a browser extension to block requests.
Copyright (C) 2014 Raymond Hill
uBlock Origin - a browser extension to block requests.
Copyright (C) 2014-2015 Raymond Hill
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand All @@ -19,22 +19,87 @@
Home: https://github.com/gorhill/uBlock
*/

/* global µBlock */
'use strict';

/******************************************************************************/

// This will inserted as a module in the µBlock object.

µBlock.utils = (function() {
// A standalone URL tokenizer will allow us to use URL tokens in more than
// just static filtering engine. This opens the door to optimize other
// filtering engine parts aside static filtering. This also allows:
// - Tokenize only on demand.
// - To potentially avoid tokenizing when same URL is fed to tokenizer.
// - Benchmarking shows this to be a common occurrence.

µBlock.urlTokenizer = {
setURL: function(url) {
if ( url !== this._urlIn ) {
this._urlIn = url;
this._urlOut = url.toLowerCase();
this._tokenized = false;
}
return this._urlOut;
},

// Tokenize on demand.
getTokens: function() {
if ( this._tokenized === false ) {
this._tokenize();
this._tokenized = true;
}
return this._tokens;
},

isTokenized: function() {
return this._tokens !== null && this._tokens[0].token !== '';
},

_Entry: function() {
this.beg = 0;
this.token = '';
},

// https://github.com/chrisaljoudi/uBlock/issues/1118
// We limit to a maximum number of tokens.
_init: function() {
this._tokens = new Array(2048);
for ( var i = 0; i < 2048; i++ ) {
this._tokens[i] = new this._Entry();
}

/******************************************************************************/
this._init = null;
},

_tokenize: function() {
var tokens = this._tokens,
re = this._reAnyToken,
url = this._urlOut;
var matches, entry;
re.lastIndex = 0;

for ( var i = 0; i < 2047; i++ ) {
matches = re.exec(url);
if ( matches === null ) {
break;
}
entry = tokens[i];
entry.beg = matches.index;
entry.token = matches[0];
}
tokens[i].token = ''; // Sentinel
},

_urlIn: '',
_urlOut: '',
_tokenized: false,
_tokens: null,
_reAnyToken: /[%0-9a-z]+/g
};

var exports = {};
µBlock.urlTokenizer._init();

/******************************************************************************/

exports.formatCount = function(count) {
µBlock.formatCount = function(count) {
if ( typeof count !== 'number' ) {
return '';
}
Expand All @@ -58,11 +123,3 @@ exports.formatCount = function(count) {
// https://www.youtube.com/watch?v=DyvzfyqYm_s

/******************************************************************************/

return exports;

/******************************************************************************/

})();

/******************************************************************************/

0 comments on commit a944873

Please sign in to comment.