Commit various optimizations to the Lexer, and add stub file for profiling the lexer.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@92 48356398-32a2-884e-a903-53898d9a118a
fdicioccio · Jul 22, 2006 · ca1aefe · ca1aefe
1 parent de5ab5e
commit ca1aefe
Show file tree

Hide file tree

Showing 5 changed files with 140 additions and 91 deletions.
diff --git a/benchmarks/ProfileDirectLex.php b/benchmarks/ProfileDirectLex.php
@@ -0,0 +1,14 @@
+<?php
+
+set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
+
+require_once 'HTMLPurifier/Lexer/DirectLex.php';
+
+$input = file_get_contents('samples/Lexer/4.html');
+$lexer = new HTMLPurifier_Lexer_DirectLex();
+
+for ($i = 0; $i < 10; $i++) {
+    $tokens = $lexer->tokenizeHTML($input);
+}
+
+?>
diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
@@ -1,7 +1,7 @@
 <?php
 
 /**
- * Forgivingly lexes SGML style documents: HTML, XML, XHTML, etc.
+ * Forgivingly lexes HTML (not XML, since it doesn't adhere to spec exactly)
  */
 
 require_once 'HTMLPurifier/Token.php';

diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -190,7 +190,14 @@ function tokenizeHTML($string) {
                             $segment, $position_first_space
                         )
                     );
-                $attributes = $this->tokenizeAttributeString($attribute_string);
+                if ($attribute_string) {
+                    $attributes = $this->tokenizeAttributeString(
+                                        $attribute_string
+                                  );
+                } else {
+                    $attributes = array();
+                }
+
                 if ($is_self_closing) {
                     $array[] = new HTMLPurifier_Token_Empty($type, $attributes);
                 } else {
@@ -216,13 +223,47 @@ function tokenizeHTML($string) {
     }
 
     function tokenizeAttributeString($string) {
-        $string = (string) $string;
-        if ($string == '') return array();
-        $array = array();
-        $cursor = 0;
-        $in_value = false;
-        $i = 0;
-        $size = strlen($string);
+        $string = (string) $string; // quick typecast
+
+        if ($string == '') return array(); // no attributes
+
+        // let's see if we can abort as quickly as possible
+        // one equal sign, no spaces => one attribute
+        $num_equal = substr_count($string, '=');
+        $has_space = strpos($string, ' ');
+        if ($num_equal === 0 && !$has_space) {
+            // bool attribute
+            return array($string => $string);
+        } elseif ($num_equal === 1 && !$has_space) {
+            // only one attribute
+            list($key, $quoted_value) = explode('=', $string);
+            $quoted_value = trim($quoted_value);
+            if (!$key) return array();
+            if (!$quoted_value) return array($key => '');
+            $first_char = @$quoted_value[0];
+            $last_char  = @$quoted_value[strlen($quoted_value)-1];
+
+            $same_quote = ($first_char == $last_char);
+            $open_quote = ($first_char == '"' || $first_char == "'");
+
+            if ( $same_quote && $open_quote) {
+                // well behaved
+                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
+            } else {
+                // not well behaved
+                if ($open_quote) {
+                    $value = substr($quoted_value, 1);
+                } else {
+                    $value = $quoted_value;
+                }
+            }
+            return array($key => $value);
+        }
+
+        // setup loop environment
+        $array  = array(); // return assoc array of attributes
+        $cursor = 0; // current position in string (moves forward)
+        $size   = strlen($string); // size of the string (stays the same)
 
         // if we have unquoted attributes, the parser expects a terminating
         // space, so let's guarantee that there's always a terminating space.
@@ -234,88 +275,75 @@ function tokenizeAttributeString($string) {
         while(true) {
 
             // infinite loop protection
-            // if we've looped 1000 times, abort. Nothing good can come of this 
             if (++$loops > 1000) return array();
 
             if ($cursor >= $size) {
                 break;
             }
+
+            $cursor += ($value = strspn($string, "\x20\x09\x0D\x0A", $cursor));
+
             $position_next_space = $this->nextWhiteSpace($string, $cursor);
-            //scroll to the last whitespace before text
-            while ($position_next_space === $cursor) {
-                $cursor++;
-                $position_next_space = $this->nextWhiteSpace($string, $cursor);
-            }
             $position_next_equal = strpos($string, '=', $cursor);
-            if ($position_next_equal !== false &&
-                 ($position_next_equal < $position_next_space ||
-                  $position_next_space === false)) {
-                //attr="asdf"
-                // grab the key
-                $key = trim(
-                    substr(
-                        $string, $cursor, $position_next_equal - $cursor
-                    )
-                );
-
-                // set cursor right after the equal sign
-                $cursor = $position_next_equal + 1;
+
+            // grab the key
+
+            $key_begin = $cursor; //we're currently at the start of the key
+
+            // scroll past all characters that are the key (not whitespace or =)
+            $cursor += strcspn($string, "\x20\x09\x0D\x0A=", $cursor);
+
+            $key_end = $cursor; // now at the end of the key
+
+            $key = substr($string, $key_begin, $key_end - $key_begin);
+
+            if (!$key) continue; // empty key
+
+            // scroll past all whitespace
+            $cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
+
+            if ($cursor >= $size) {
+                $array[$key] = $key;
+                break;
+            }
+
+            // if the next character is an equal sign, we've got a regular
+            // pair, otherwise, it's a bool attribute
+            $first_char = @$string[$cursor];
+
+            if ($first_char == '=') {
+                // key="value"
 
-                // consume all spaces after the equal sign
-                $position_next_space = $this->nextWhiteSpace($string, $cursor);
-                while ($position_next_space === $cursor) {
-                    $cursor++;
-                    $position_next_space=$this->nextWhiteSpace($string,$cursor);
-                }
+                $cursor++;
+                $cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
 
-                // if we've hit the end, assign the key an empty value and abort
-                if ($cursor >= $size) {
-                    $array[$key] = '';
-                    break;
-                }
+                // we might be in front of a quote right now
 
-                // find the next quote
-                $position_next_quote = $this->nextQuote($string, $cursor);
+                $char = @$string[$cursor];
 
-                // if the quote is not where the cursor is, we're dealing
-                // with an unquoted attribute
-                if ($position_next_quote !== $cursor) {
-                    if ($key) {
-                        $array[$key] = trim(substr($string, $cursor,
-                          $position_next_space - $cursor));
-                    }
-                    $cursor = $position_next_space + 1;
-                    continue;
+                if ($char == '"' || $char == "'") {
+                    // it's quoted, end bound is $char
+                    $cursor++;
+                    $value_begin = $cursor;
+                    $cursor = strpos($string, $char, $cursor);
+                    $value_end = $cursor;
+                } else {
+                    // it's not quoted, end bound is whitespace
+                    $value_begin = $cursor;
+                    $cursor += strcspn($string, "\x20\x09\x0D\x0A", $cursor);
+                    $value_end = $cursor;
                 }
 
-                // otherwise, regular attribute
-                $quote = $string{$position_next_quote};
-                $position_end_quote = strpos(
-                    $string, $quote, $position_next_quote + 1
-                );
-
-                // check if the ending quote is missing
-                if ($position_end_quote === false) {
-                    // it is, assign it to the end of the string
-                    $position_end_quote = $size;
-                }
+                $value = substr($string, $value_begin, $value_end - $value_begin);
+                $array[$key] = $value;
+                $cursor++;
 
-                $value = substr($string, $position_next_quote + 1,
-                  $position_end_quote - $position_next_quote - 1);
-                if ($key) {
-                    $array[$key] = html_entity_decode($value, ENT_QUOTES);
-                }
-                $cursor = $position_end_quote + 1;
             } else {
-                //boolattr
-                if ($position_next_space === false) {
-                    $position_next_space = $size;
-                }
-                $key = substr($string, $cursor, $position_next_space - $cursor);
-                if ($key) {
+                // boolattr
+                if ($key !== '') {
                     $array[$key] = $key;
                 }
-                $cursor = $position_next_space + 1;
+
             }
         }
         return $array;

diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php
@@ -8,36 +8,26 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
 {
     var $is_tag = true;
     var $name;
-    function HTMLPurifier_Token_Tag($name) {
-        // watch out, actually XML is case-sensitive, while HTML
-        // is case insensitive, which means we can't use this for XML
-        $this->name = strtolower($name); // for some reason, the SAX parser
-                                         // uses uppercase. Investigate?
-    }
-}
-
-// a rich tag has attributes
-class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract
-{
     var $attributes = array();
-    function HTMLPurifier_Token_RichTag($name, $attributes = array()) {
-        $this->HTMLPurifier_Token_Tag($name);
+    function HTMLPurifier_Token_Tag($name, $attributes = array()) {
+        $this->name = ctype_lower($name) ? $name : strtolower($name);
         $this->attributes = $attributes;
     }
 }
 
 // start CONCRETE ones
 
-class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag
+class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
 {
     var $type = 'start';
 }
 
-class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_RichTag
+class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
 {
     var $type = 'empty';
 }
 
+// accepts attributes even though it really can't, for optimization reasons
 class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
 {
     var $type = 'end';
@@ -51,7 +41,7 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
     var $is_whitespace = false;
     function HTMLPurifier_Token_Text($data) {
         $this->data = $data;
-        if (trim($data, " \n\r\t") === '') $this->is_whitespace = true;
+        if (ctype_space($data)) $this->is_whitespace = true;
     }
     function append($text) {
         return new HTMLPurifier_Token_Text($this->data . $text->data);

diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
@@ -153,13 +153,18 @@ function test_tokenizeHTML() {
 
         // [SGML-INVALID]
         $input[10] = '<a "=>';
+        // We barf on this, aim for no attributes
         $expect[10] = array(
             new HTMLPurifier_Token_Start('a', array('"' => ''))
             );
-        // DOM doesn't register an invalid attribute
+        // DOM correctly has no attributes, but also closes the tag
         $dom_expect[10] = array(
             new HTMLPurifier_Token_Empty('a')
             );
+        // SAX barfs on this
+        $sax_expect[10] = array(
+            new HTMLPurifier_Token_Start('a', array('"' => ''))
+            );
 
         // [INVALID] [RECOVERABLE]
         $input[11] = '"';
@@ -232,6 +237,18 @@ function test_tokenizeAttributeString() {
         $input[6] = 'href="foo';
         $expect[6] = array('href' => 'foo');
 
+        $input[7] = '"=';
+        $expect[7] = array('"' => '');
+        //           0123456789012345678901234567890123
+        $input[8] = 'href ="about:blank"rel ="nofollow"';
+        $expect[8] = array('href' => 'about:blank', 'rel' => 'nofollow');
+
+        $input[9] = 'foo bar';
+        $expect[9] = array('foo' => 'foo', 'bar' => 'bar');
+
+        $input[10] = 'foo="bar" blue';
+        $expect[10] = array('foo' => 'bar', 'blue' => 'blue');
+
         $size = count($input);
         for($i = 0; $i < $size; $i++) {
             $result = $this->DirectLex->tokenizeAttributeString($input[$i]);