Skip to content

Commit

Permalink
Commit various optimizations to the Lexer, and add stub file for profiling the lexer.
Browse files Browse the repository at this point in the history

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@92 48356398-32a2-884e-a903-53898d9a118a
  • Loading branch information
Edward Z. Yang committed Jul 22, 2006
1 parent de5ab5e commit ca1aefe
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 91 deletions.
14 changes: 14 additions & 0 deletions benchmarks/ProfileDirectLex.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?php

// Profiling stub for HTMLPurifier_Lexer_DirectLex: reads one sample HTML
// document and tokenizes it ten times with the same lexer instance.

// Resolve paths against this script's directory rather than the current
// working directory, so the stub behaves the same wherever it is launched.
$benchmark_dir = dirname(__FILE__);

set_include_path(
    get_include_path() . PATH_SEPARATOR . $benchmark_dir . '/../library/'
);

require_once 'HTMLPurifier/Lexer/DirectLex.php';

$sample_file = $benchmark_dir . '/samples/Lexer/4.html';
$input = file_get_contents($sample_file);
if ($input === false) {
    // Without this guard the loop below would be handed `false` instead of
    // HTML, and the resulting profile would not reflect real lexing work.
    echo 'Unable to read sample file: ' . $sample_file . "\n";
    exit(1);
}

$lexer = new HTMLPurifier_Lexer_DirectLex();

// Repeat the identical tokenization; the result is discarded each time.
for ($i = 0; $i < 10; $i++) {
    $tokens = $lexer->tokenizeHTML($input);
}

?>
2 changes: 1 addition & 1 deletion library/HTMLPurifier/Lexer.php
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?php

/**
* Forgivingly lexes SGML style documents: HTML, XML, XHTML, etc.
* Forgivingly lexes HTML (not XML, since it doesn't adhere to spec exactly)
*/

require_once 'HTMLPurifier/Token.php';
Expand Down
174 changes: 101 additions & 73 deletions library/HTMLPurifier/Lexer/DirectLex.php
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,14 @@ function tokenizeHTML($string) {
$segment, $position_first_space
)
);
$attributes = $this->tokenizeAttributeString($attribute_string);
if ($attribute_string) {
$attributes = $this->tokenizeAttributeString(
$attribute_string
);
} else {
$attributes = array();
}

if ($is_self_closing) {
$array[] = new HTMLPurifier_Token_Empty($type, $attributes);
} else {
Expand All @@ -216,13 +223,47 @@ function tokenizeHTML($string) {
}

function tokenizeAttributeString($string) {
$string = (string) $string;
if ($string == '') return array();
$array = array();
$cursor = 0;
$in_value = false;
$i = 0;
$size = strlen($string);
$string = (string) $string; // quick typecast

if ($string == '') return array(); // no attributes

// let's see if we can abort as quickly as possible
// one equal sign, no spaces => one attribute
$num_equal = substr_count($string, '=');
$has_space = strpos($string, ' ');
if ($num_equal === 0 && !$has_space) {
// bool attribute
return array($string => $string);
} elseif ($num_equal === 1 && !$has_space) {
// only one attribute
list($key, $quoted_value) = explode('=', $string);
$quoted_value = trim($quoted_value);
if (!$key) return array();
if (!$quoted_value) return array($key => '');
$first_char = @$quoted_value[0];
$last_char = @$quoted_value[strlen($quoted_value)-1];

$same_quote = ($first_char == $last_char);
$open_quote = ($first_char == '"' || $first_char == "'");

if ( $same_quote && $open_quote) {
// well behaved
$value = substr($quoted_value, 1, strlen($quoted_value) - 2);
} else {
// not well behaved
if ($open_quote) {
$value = substr($quoted_value, 1);
} else {
$value = $quoted_value;
}
}
return array($key => $value);
}

// setup loop environment
$array = array(); // return assoc array of attributes
$cursor = 0; // current position in string (moves forward)
$size = strlen($string); // size of the string (stays the same)

// if we have unquoted attributes, the parser expects a terminating
// space, so let's guarantee that there's always a terminating space.
Expand All @@ -234,88 +275,75 @@ function tokenizeAttributeString($string) {
while(true) {

// infinite loop protection
// if we've looped 1000 times, abort. Nothing good can come of this
if (++$loops > 1000) return array();

if ($cursor >= $size) {
break;
}

$cursor += ($value = strspn($string, "\x20\x09\x0D\x0A", $cursor));

$position_next_space = $this->nextWhiteSpace($string, $cursor);
//scroll to the last whitespace before text
while ($position_next_space === $cursor) {
$cursor++;
$position_next_space = $this->nextWhiteSpace($string, $cursor);
}
$position_next_equal = strpos($string, '=', $cursor);
if ($position_next_equal !== false &&
($position_next_equal < $position_next_space ||
$position_next_space === false)) {
//attr="asdf"
// grab the key
$key = trim(
substr(
$string, $cursor, $position_next_equal - $cursor
)
);

// set cursor right after the equal sign
$cursor = $position_next_equal + 1;

// grab the key

$key_begin = $cursor; //we're currently at the start of the key

// scroll past all characters that are the key (not whitespace or =)
$cursor += strcspn($string, "\x20\x09\x0D\x0A=", $cursor);

$key_end = $cursor; // now at the end of the key

$key = substr($string, $key_begin, $key_end - $key_begin);

if (!$key) continue; // empty key

// scroll past all whitespace
$cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);

if ($cursor >= $size) {
$array[$key] = $key;
break;
}

// if the next character is an equal sign, we've got a regular
// pair, otherwise, it's a bool attribute
$first_char = @$string[$cursor];

if ($first_char == '=') {
// key="value"

// consume all spaces after the equal sign
$position_next_space = $this->nextWhiteSpace($string, $cursor);
while ($position_next_space === $cursor) {
$cursor++;
$position_next_space=$this->nextWhiteSpace($string,$cursor);
}
$cursor++;
$cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);

// if we've hit the end, assign the key an empty value and abort
if ($cursor >= $size) {
$array[$key] = '';
break;
}
// we might be in front of a quote right now

// find the next quote
$position_next_quote = $this->nextQuote($string, $cursor);
$char = @$string[$cursor];

// if the quote is not where the cursor is, we're dealing
// with an unquoted attribute
if ($position_next_quote !== $cursor) {
if ($key) {
$array[$key] = trim(substr($string, $cursor,
$position_next_space - $cursor));
}
$cursor = $position_next_space + 1;
continue;
if ($char == '"' || $char == "'") {
// it's quoted, end bound is $char
$cursor++;
$value_begin = $cursor;
$cursor = strpos($string, $char, $cursor);
$value_end = $cursor;
} else {
// it's not quoted, end bound is whitespace
$value_begin = $cursor;
$cursor += strcspn($string, "\x20\x09\x0D\x0A", $cursor);
$value_end = $cursor;
}

// otherwise, regular attribute
$quote = $string{$position_next_quote};
$position_end_quote = strpos(
$string, $quote, $position_next_quote + 1
);

// check if the ending quote is missing
if ($position_end_quote === false) {
// it is, assign it to the end of the string
$position_end_quote = $size;
}
$value = substr($string, $value_begin, $value_end - $value_begin);
$array[$key] = $value;
$cursor++;

$value = substr($string, $position_next_quote + 1,
$position_end_quote - $position_next_quote - 1);
if ($key) {
$array[$key] = html_entity_decode($value, ENT_QUOTES);
}
$cursor = $position_end_quote + 1;
} else {
//boolattr
if ($position_next_space === false) {
$position_next_space = $size;
}
$key = substr($string, $cursor, $position_next_space - $cursor);
if ($key) {
// boolattr
if ($key !== '') {
$array[$key] = $key;
}
$cursor = $position_next_space + 1;

}
}
return $array;
Expand Down
22 changes: 6 additions & 16 deletions library/HTMLPurifier/Token.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,26 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
{
var $is_tag = true;
var $name;
function HTMLPurifier_Token_Tag($name) {
// watch out, actually XML is case-sensitive, while HTML
// is case insensitive, which means we can't use this for XML
$this->name = strtolower($name); // for some reason, the SAX parser
// uses uppercase. Investigate?
}
}

// a rich tag has attributes
class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract
{
var $attributes = array();
function HTMLPurifier_Token_RichTag($name, $attributes = array()) {
$this->HTMLPurifier_Token_Tag($name);
function HTMLPurifier_Token_Tag($name, $attributes = array()) {
$this->name = ctype_lower($name) ? $name : strtolower($name);
$this->attributes = $attributes;
}
}

// start CONCRETE ones

class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{
var $type = 'start';
}

class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_RichTag
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
var $type = 'empty';
}

// accepts attributes even though it really can't, for optimization reasons
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
var $type = 'end';
Expand All @@ -51,7 +41,7 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
var $is_whitespace = false;
function HTMLPurifier_Token_Text($data) {
$this->data = $data;
if (trim($data, " \n\r\t") === '') $this->is_whitespace = true;
if (ctype_space($data)) $this->is_whitespace = true;
}
function append($text) {
return new HTMLPurifier_Token_Text($this->data . $text->data);
Expand Down
19 changes: 18 additions & 1 deletion tests/HTMLPurifier/LexerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,18 @@ function test_tokenizeHTML() {

// [SGML-INVALID]
$input[10] = '<a "=>';
// We barf on this, aim for no attributes
$expect[10] = array(
new HTMLPurifier_Token_Start('a', array('"' => ''))
);
// DOM doesn't register an invalid attribute
// DOM correctly has no attributes, but also closes the tag
$dom_expect[10] = array(
new HTMLPurifier_Token_Empty('a')
);
// SAX barfs on this
$sax_expect[10] = array(
new HTMLPurifier_Token_Start('a', array('"' => ''))
);

// [INVALID] [RECOVERABLE]
$input[11] = '"';
Expand Down Expand Up @@ -232,6 +237,18 @@ function test_tokenizeAttributeString() {
$input[6] = 'href="foo';
$expect[6] = array('href' => 'foo');

$input[7] = '"=';
$expect[7] = array('"' => '');
// 0123456789012345678901234567890123
$input[8] = 'href ="about:blank"rel ="nofollow"';
$expect[8] = array('href' => 'about:blank', 'rel' => 'nofollow');

$input[9] = 'foo bar';
$expect[9] = array('foo' => 'foo', 'bar' => 'bar');

$input[10] = 'foo="bar" blue';
$expect[10] = array('foo' => 'bar', 'blue' => 'blue');

$size = count($input);
for($i = 0; $i < $size; $i++) {
$result = $this->DirectLex->tokenizeAttributeString($input[$i]);
Expand Down

0 comments on commit ca1aefe

Please sign in to comment.