Skip to content

Commit

Permalink
token limit of 50k per rule and silent fail
Browse files Browse the repository at this point in the history
  • Loading branch information
AndiDittrich committed Nov 29, 2020
1 parent 7df6d6d commit 4419718
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 59 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
### Version 3.5.0 ###

* Changed: `droide` theme color `#009999` to `#007f7f` for higher contrast (WCAG) - thanks to [aphelionz on GitHub](https://github.com/EnlighterJS/EnlighterJS/pull/117)
* Changed: in case of a tokenizer error, the tokenizer will silently fail (output to console) instead of throwing an error - code will still be displayed but related tokens are missing
* Bugfix: tokenizer loop limit was calculated in total instead of per-rule
* Bugfix: keywords of the `generic` language require a word boundary before and after instead of a non-word character - thanks to [Irwanda04 on GitHub](https://github.com/EnlighterJS/EnlighterJS/issues/129)
* Bugfix: allow any non-whitespace chars in sql column name literals
* Bugfix: added missing `string` keyword to `c#` type list
Expand Down
2 changes: 1 addition & 1 deletion dist/enlighterjs.min.js

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/browser/EnlighterJS.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {getRawCodeFromElement} from '../engine/sourcecode-extractor';
import * as _codegroupMapper from '../engine/codegroup-mapper';
import * as _optionReader from '../engine/option-reader';
import * as _elementManager from '../engine/element-manager';
import * as _logger from '../lib/console';

// static properties
export const version = '[[VERSION]]';
Expand Down Expand Up @@ -72,8 +73,7 @@ export function enlight(elements, elementOptions={}){

// Global Error Handling (FATAL ERRORS)
}catch (err){
/* eslint no-console: 0 */
console.error('EnlighterJS Internal Error:', err);
_logger.error('EnlighterJS Internal Error:', err);
return false;
}
}
Expand Down
133 changes: 77 additions & 56 deletions src/engine/tokenizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,84 +7,105 @@
// ----------------------------------------------------------------------

import _token from './token';
import * as _logger from '../lib/console';

// Stage-1 Master Tokenizer
export function getTokens(code, rules, defaultTokenType = 'text'){
function findAllMatches(code, rule, priority){
// token list
let rawTokens = [];
const tokens = [];

// current match
let match;

// iteration counter
let iterationCounter = 0;

// apply each rule to given sourcecode string
for (let priority=0;priority<rules.length;priority++){
// extract current rule
const rule = rules[priority];
// find ALL possible matches
while ((match = rule.regex.exec(code)) != null){
// increment counter
iterationCounter++;

// valid rule ? otherwise skip it!
if (!rule || !rule.type || !rule.regex){
return;
// throw an error on > 50k tokens - seems to be an infinite loop which may crash the browser!
if (iterationCounter > 50000){
throw new Error('Infinite tokenizer loop detected; more than 50k tokens - language rule [' + priority + '] ' + rule.regex + ' seems to be broken');
}

let match;
// ignore empty matches
if (match[0].length == 0){
continue;
}

// find ALL possible matches
while ((match = rule.regex.exec(code)) != null){
// increment counter
iterationCounter++;
// overrides the usual regex behaviour of not matching results that overlap
// normally it should be only +1.
// to optimize the matching performance, we skip a third of the result length and start the new matching
rule.regex.lastIndex = match.index + 1 + match[0].length/3;

// throw an error on > 50k tokens - seems to be an infinite loop which may crash the browser!
if (iterationCounter > 50000){
throw new Error('Infinite tokenizer loop detected; more than 50k tokens - language rule [' + priority + '] ' + rule.regex + ' seems to be broken');
}
// default type - first element
const defaultType = (Array.isArray(rule.type)) ? rule.type[0] : rule.type;

// ignore empty matches
if (match[0].length == 0){
continue;
}
// default filter - first element
const defaultFilter = ((Array.isArray(rule.filter)) ? rule.filter[0] : rule.filter) || null;

// overrides the usual regex behaviour of not matching results that overlap
// normally it should be only +1.
// to optimize the matching performance, we skip a third of the result length and start the new matching
rule.regex.lastIndex = match.index + 1 + match[0].length/3;
// matching group used ?
if (match.length > 1){

// default type - first element
const defaultType = (Array.isArray(rule.type)) ? rule.type[0] : rule.type;
// match indexOf offset
let offset = 0;

// process each matching group as single token
for (let i=1;i<match.length;i++){

// default filter - first element
const defaultFilter = ((Array.isArray(rule.filter)) ? rule.filter[0] : rule.filter) || null;
// valid match ?
if (match[i]){
// is array ? get nth type
const type = (Array.isArray(rule.type) && rule.type.length >= i) ? rule.type[i-1] : defaultType;

// matching group used ?
if (match.length > 1){
// is array ? get nth type
const filter = (Array.isArray(rule.filter) && rule.filter.length >= i) ? rule.filter[i-1] : defaultFilter;

// match indexOf offset
let offset = 0;

// process each matching group as single token
for (let i=1;i<match.length;i++){
// get match index - avoid overlapping using offset
const matchPosition = match[0].indexOf(match[i], offset);

// valid match ?
if (match[i]){
// is array ? get nth type
const type = (Array.isArray(rule.type) && rule.type.length >= i) ? rule.type[i-1] : defaultType;
// set new offset
offset = matchPosition;

// is array ? get nth type
const filter = (Array.isArray(rule.filter) && rule.filter.length >= i) ? rule.filter[i-1] : defaultFilter;
// create new token
tokens.push(_token(match[i], type, filter, match.index + matchPosition, priority));
}
}
}else{
// use full pattern matching
tokens.push(_token(match[0], defaultType, defaultFilter, match.index, priority));
}
}

// get match index - avoid overlapping using offset
const matchPosition = match[0].indexOf(match[i], offset);
return tokens;
}

// set new offset
offset = matchPosition;
// Stage-1 Master Tokenizer
export function getTokens(code, rules, defaultTokenType = 'text'){
// token list
let rawTokens = [];

// create new token
rawTokens.push(_token(match[i], type, filter, match.index + matchPosition, priority));
}
}
}else{
// use full pattern matching
rawTokens.push(_token(match[0], defaultType, defaultFilter, match.index, priority));
}
// apply each rule to given sourcecode string
for (let priority=0;priority<rules.length;priority++){
// extract current rule
const rule = rules[priority];

// valid rule ? otherwise skip it!
if (!rule || !rule.type || !rule.regex){
return;
}

try{
// try to get all tokens
const tokens = findAllMatches(code, rule, priority);

// push tokens to list
rawTokens = rawTokens.concat(tokens);

// catch and ignore tokenizer errors
}catch(e){
_logger.error(e);
}
}

Expand Down
17 changes: 17 additions & 0 deletions src/lib/console.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// ----------------------------------------------------------------------
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
// --
// Copyright 2016-2020 Andi Dittrich <https://andidittrich.com>
// ----------------------------------------------------------------------

/**
 * Writes an error message to the console, if one is available.
 *
 * Prefers `console.error` and falls back to `console.log`. When neither
 * exists (or no console object is exposed at all) the message is dropped
 * silently, so logging can never break the caller.
 *
 * @param {...*} msg - values forwarded unchanged to the console method
 * @returns {void}
 */
export function error(...msg){
    // some environments do not expose a console object at all;
    // `typeof` avoids the ReferenceError a direct access would raise
    if (typeof console === 'undefined'){
        return;
    }

    // prefer the error channel, fall back to the regular log output
    const logger = console.error || console.log;

    // invoke with console as receiver - calling a detached console method
    // throws "Illegal invocation" in some (older) browser implementations
    if (typeof logger === 'function'){
        logger.apply(console, msg);
    }
}

/**
 * Writes an informational message to the console, if one is available.
 *
 * When `console.log` does not exist (or no console object is exposed at
 * all) the message is dropped silently, so logging can never break the
 * caller.
 *
 * @param {...*} msg - values forwarded unchanged to `console.log`
 * @returns {void}
 */
export function log(...msg){
    // some environments do not expose a console object at all;
    // `typeof` avoids the ReferenceError a direct access would raise
    if (typeof console === 'undefined'){
        return;
    }

    // invoke with console as receiver - calling a detached console method
    // throws "Illegal invocation" in some (older) browser implementations
    if (typeof console.log === 'function'){
        console.log.apply(console, msg);
    }
}

0 comments on commit 4419718

Please sign in to comment.