From 110759386e2f9b4d88bf68c669b6c54ad4b5c04f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Almada?= Date: Sun, 5 Apr 2015 08:50:35 -0300 Subject: [PATCH] ext tokenizer port + cleanup unused lexer states We basically added a mechanism to store the token stream during parsing and exposed the entire parser stack on the tokenizer extension through an opt-in flag: token_get_all($src, TOKEN_PARSE). This change allows easy future language enhancements regarding context-aware parsing & scanning without further maintenance on the tokenizer extension, while also solving known inconsistencies the "parseless" tokenizer extension has when it handles the presence of `__halt_compiler()`. --- Zend/zend_compile.c | 6 +- Zend/zend_globals.h | 9 + Zend/zend_language_parser.y | 14 +- Zend/zend_language_scanner.h | 4 + Zend/zend_language_scanner.l | 389 +++++++++--------- .../tests/token_get_all_TOKEN_PARSE_000.phpt | 19 + .../tests/token_get_all_TOKEN_PARSE_001.phpt | 81 ++++ .../tests/token_get_all_TOKEN_PARSE_002.phpt | 68 +++ ext/tokenizer/tests/token_get_all_error.phpt | 8 +- ext/tokenizer/tokenizer.c | 132 +++++- 10 files changed, 510 insertions(+), 220 deletions(-) create mode 100644 ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt create mode 100644 ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt create mode 100644 ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt diff --git a/Zend/zend_compile.c b/Zend/zend_compile.c index c92a25a705389..210810379f58f 100644 --- a/Zend/zend_compile.c +++ b/Zend/zend_compile.c @@ -30,7 +30,6 @@ #include "zend_interfaces.h" #include "zend_virtual_cwd.h" #include "zend_multibyte.h" -#include "zend_language_scanner.h" #include "zend_inheritance.h" #define SET_NODE(target, src) do { \ @@ -568,7 +567,10 @@ static int zend_add_const_name_literal(zend_op_array *op_array, zend_string *nam op.constant = zend_add_literal(CG(active_op_array), &_c); \ } while (0) -void zend_stop_lexing(void) { +void zend_stop_lexing(void) +{ + 
if(LANG_SCNG(on_event)) LANG_SCNG(on_event)(ON_STOP, END, 0); + LANG_SCNG(yy_cursor) = LANG_SCNG(yy_limit); } diff --git a/Zend/zend_globals.h b/Zend/zend_globals.h index 326955a103b3c..28487a2a4a185 100644 --- a/Zend/zend_globals.h +++ b/Zend/zend_globals.h @@ -249,6 +249,12 @@ struct _zend_ini_scanner_globals { int scanner_mode; }; +typedef enum { + ON_TOKEN, + ON_FEEDBACK, + ON_STOP +} zend_php_scanner_event; + struct _zend_php_scanner_globals { zend_file_handle *yy_in; zend_file_handle *yy_out; @@ -278,6 +284,9 @@ struct _zend_php_scanner_globals { /* initial string length after scanning to first variable */ int scanned_string_len; + + /* hooks */ + void (* on_event)(zend_php_scanner_event event, int token, int line); }; #endif /* ZEND_GLOBALS_H */ diff --git a/Zend/zend_language_parser.y b/Zend/zend_language_parser.y index cefcd0cad928e..f6318ec0c0e33 100644 --- a/Zend/zend_language_parser.y +++ b/Zend/zend_language_parser.y @@ -35,7 +35,7 @@ #include "zend_globals.h" #include "zend_API.h" #include "zend_constants.h" -#include "zend_language_scanner_defs.h" +#include "zend_language_scanner.h" #define YYSIZE_T size_t #define yytnamerr zend_yytnamerr @@ -49,12 +49,6 @@ static YYSIZE_T zend_yytnamerr(char*, const char*); #define YYFREE free #endif -#define REWIND { \ - zend_stack_push(&LANG_SCNG(state_stack), (void *) &LANG_SCNG(yy_state)); \ - LANG_SCNG(yy_state) = yycST_LOOKING_FOR_SEMI_RESERVED_NAME; \ - LANG_SCNG(yy_cursor) = (unsigned char*)LANG_SCNG(yy_text); \ - LANG_SCNG(yy_leng) = 0; } - %} %pure_parser @@ -290,7 +284,11 @@ semi_reserved: identifier: T_STRING { $$ = $1; } - | /* if */ semi_reserved { REWIND } /* and rematch as */ T_STRING { $$ = $3; } + | semi_reserved { + zval zv; + zend_lex_tstring(&zv); + $$ = zend_ast_create_zval(&zv); + } ; top_statement_list: diff --git a/Zend/zend_language_scanner.h b/Zend/zend_language_scanner.h index c82b3069c5906..3b75ff8cc45a0 100644 --- a/Zend/zend_language_scanner.h +++ b/Zend/zend_language_scanner.h @@ 
-50,6 +50,9 @@ typedef struct _zend_lex_state { zend_encoding_filter output_filter; const zend_encoding *script_encoding; + /* hooks */ + void (* on_event)(zend_php_scanner_event event, int token, int line); + zend_ast *ast; zend_arena *ast_arena; } zend_lex_state; @@ -66,6 +69,7 @@ ZEND_API void zend_restore_lexical_state(zend_lex_state *lex_state); ZEND_API int zend_prepare_string_for_scanning(zval *str, char *filename); ZEND_API void zend_multibyte_yyinput_again(zend_encoding_filter old_input_filter, const zend_encoding *old_encoding); ZEND_API int zend_multibyte_set_filter(const zend_encoding *onetime_encoding); +ZEND_API void zend_lex_tstring(zval *zv); END_EXTERN_C() diff --git a/Zend/zend_language_scanner.l b/Zend/zend_language_scanner.l index 2481af605b7df..cde0621df0e6c 100644 --- a/Zend/zend_language_scanner.l +++ b/Zend/zend_language_scanner.l @@ -193,6 +193,7 @@ void shutdown_scanner(void) zend_stack_destroy(&SCNG(state_stack)); zend_ptr_stack_clean(&SCNG(heredoc_label_stack), (void (*)(void *)) &heredoc_label_dtor, 1); zend_ptr_stack_destroy(&SCNG(heredoc_label_stack)); + SCNG(on_event) = NULL; } ZEND_API void zend_save_lexical_state(zend_lex_state *lex_state) @@ -223,6 +224,8 @@ ZEND_API void zend_save_lexical_state(zend_lex_state *lex_state) lex_state->output_filter = SCNG(output_filter); lex_state->script_encoding = SCNG(script_encoding); + lex_state->on_event = SCNG(on_event); + lex_state->ast = CG(ast); lex_state->ast_arena = CG(ast_arena); } @@ -260,6 +263,8 @@ ZEND_API void zend_restore_lexical_state(zend_lex_state *lex_state) SCNG(output_filter) = lex_state->output_filter; SCNG(script_encoding) = lex_state->script_encoding; + SCNG(on_event) = lex_state->on_event; + CG(ast) = lex_state->ast; CG(ast_arena) = lex_state->ast_arena; @@ -276,6 +281,13 @@ ZEND_API void zend_destroy_file_handle(zend_file_handle *file_handle) } } +ZEND_API void zend_lex_tstring(zval *zv) +{ + if (SCNG(on_event)) SCNG(on_event)(ON_FEEDBACK, T_STRING, 0); + + 
ZVAL_STRINGL(zv, (char*)SCNG(yy_text), SCNG(yy_leng)); +} + #define BOM_UTF32_BE "\x00\x00\xfe\xff" #define BOM_UTF32_LE "\xff\xfe\x00\x00" #define BOM_UTF16_BE "\xfe\xff" @@ -1083,9 +1095,20 @@ static int zend_scan_escape_string(zval *zendlval, char *str, int len, char quot return SUCCESS; } +static zend_always_inline int emit_token(int token, int token_line) +{ + if(SCNG(on_event)) SCNG(on_event)(ON_TOKEN, token, token_line); + + return token; +} + +#define RETURN_TOKEN(token) return emit_token(token, start_line); int lex_scan(zval *zendlval) { + +int start_line = CG(zend_lineno); + restart: SCNG(yy_text) = YYCURSOR; @@ -1107,183 +1130,183 @@ NEWLINE ("\r"|"\n"|"\r\n") := yyleng = YYCURSOR - SCNG(yy_text); "exit" { - return T_EXIT; + RETURN_TOKEN(T_EXIT); } "die" { - return T_EXIT; + RETURN_TOKEN(T_EXIT); } "function" { - return T_FUNCTION; + RETURN_TOKEN(T_FUNCTION); } "const" { - return T_CONST; + RETURN_TOKEN(T_CONST); } "return" { - return T_RETURN; + RETURN_TOKEN(T_RETURN); } "yield"{WHITESPACE}"from" { - return T_YIELD_FROM; + RETURN_TOKEN(T_YIELD_FROM); } "yield" { - return T_YIELD; + RETURN_TOKEN(T_YIELD); } "try" { - return T_TRY; + RETURN_TOKEN(T_TRY); } "catch" { - return T_CATCH; + RETURN_TOKEN(T_CATCH); } "finally" { - return T_FINALLY; + RETURN_TOKEN(T_FINALLY); } "throw" { - return T_THROW; + RETURN_TOKEN(T_THROW); } "if" { - return T_IF; + RETURN_TOKEN(T_IF); } "elseif" { - return T_ELSEIF; + RETURN_TOKEN(T_ELSEIF); } "endif" { - return T_ENDIF; + RETURN_TOKEN(T_ENDIF); } "else" { - return T_ELSE; + RETURN_TOKEN(T_ELSE); } "while" { - return T_WHILE; + RETURN_TOKEN(T_WHILE); } "endwhile" { - return T_ENDWHILE; + RETURN_TOKEN(T_ENDWHILE); } "do" { - return T_DO; + RETURN_TOKEN(T_DO); } "for" { - return T_FOR; + RETURN_TOKEN(T_FOR); } "endfor" { - return T_ENDFOR; + RETURN_TOKEN(T_ENDFOR); } "foreach" { - return T_FOREACH; + RETURN_TOKEN(T_FOREACH); } "endforeach" { - return T_ENDFOREACH; + RETURN_TOKEN(T_ENDFOREACH); } "declare" { - return 
T_DECLARE; + RETURN_TOKEN(T_DECLARE); } "enddeclare" { - return T_ENDDECLARE; + RETURN_TOKEN(T_ENDDECLARE); } "instanceof" { - return T_INSTANCEOF; + RETURN_TOKEN(T_INSTANCEOF); } "as" { - return T_AS; + RETURN_TOKEN(T_AS); } "switch" { - return T_SWITCH; + RETURN_TOKEN(T_SWITCH); } "endswitch" { - return T_ENDSWITCH; + RETURN_TOKEN(T_ENDSWITCH); } "case" { - return T_CASE; + RETURN_TOKEN(T_CASE); } "default" { - return T_DEFAULT; + RETURN_TOKEN(T_DEFAULT); } "break" { - return T_BREAK; + RETURN_TOKEN(T_BREAK); } "continue" { - return T_CONTINUE; + RETURN_TOKEN(T_CONTINUE); } "goto" { - return T_GOTO; + RETURN_TOKEN(T_GOTO); } "echo" { - return T_ECHO; + RETURN_TOKEN(T_ECHO); } "print" { - return T_PRINT; + RETURN_TOKEN(T_PRINT); } "class" { - return T_CLASS; + RETURN_TOKEN(T_CLASS); } "interface" { - return T_INTERFACE; + RETURN_TOKEN(T_INTERFACE); } "trait" { - return T_TRAIT; + RETURN_TOKEN(T_TRAIT); } "extends" { - return T_EXTENDS; + RETURN_TOKEN(T_EXTENDS); } "implements" { - return T_IMPLEMENTS; + RETURN_TOKEN(T_IMPLEMENTS); } "->" { yy_push_state(ST_LOOKING_FOR_PROPERTY); - return T_OBJECT_OPERATOR; + RETURN_TOKEN(T_OBJECT_OPERATOR); } -{WHITESPACE}+ { +{WHITESPACE}+ { HANDLE_NEWLINES(yytext, yyleng); - return T_WHITESPACE; + RETURN_TOKEN(T_WHITESPACE); } "->" { - return T_OBJECT_OPERATOR; + RETURN_TOKEN(T_OBJECT_OPERATOR); } {LABEL} { yy_pop_state(); zend_copy_value(zendlval, yytext, yyleng); - return T_STRING; + RETURN_TOKEN(T_STRING); } {ANY_CHAR} { @@ -1293,283 +1316,283 @@ NEWLINE ("\r"|"\n"|"\r\n") } "::" { - return T_PAAMAYIM_NEKUDOTAYIM; + RETURN_TOKEN(T_PAAMAYIM_NEKUDOTAYIM); } "\\" { - return T_NS_SEPARATOR; + RETURN_TOKEN(T_NS_SEPARATOR); } "..." { - return T_ELLIPSIS; + RETURN_TOKEN(T_ELLIPSIS); } "??" 
{ - return T_COALESCE; + RETURN_TOKEN(T_COALESCE); } "new" { - return T_NEW; + RETURN_TOKEN(T_NEW); } "clone" { - return T_CLONE; + RETURN_TOKEN(T_CLONE); } "var" { - return T_VAR; + RETURN_TOKEN(T_VAR); } "("{TABS_AND_SPACES}("int"|"integer"){TABS_AND_SPACES}")" { - return T_INT_CAST; + RETURN_TOKEN(T_INT_CAST); } "("{TABS_AND_SPACES}("real"|"double"|"float"){TABS_AND_SPACES}")" { - return T_DOUBLE_CAST; + RETURN_TOKEN(T_DOUBLE_CAST); } "("{TABS_AND_SPACES}("string"|"binary"){TABS_AND_SPACES}")" { - return T_STRING_CAST; + RETURN_TOKEN(T_STRING_CAST); } "("{TABS_AND_SPACES}"array"{TABS_AND_SPACES}")" { - return T_ARRAY_CAST; + RETURN_TOKEN(T_ARRAY_CAST); } "("{TABS_AND_SPACES}"object"{TABS_AND_SPACES}")" { - return T_OBJECT_CAST; + RETURN_TOKEN(T_OBJECT_CAST); } "("{TABS_AND_SPACES}("bool"|"boolean"){TABS_AND_SPACES}")" { - return T_BOOL_CAST; + RETURN_TOKEN(T_BOOL_CAST); } "("{TABS_AND_SPACES}("unset"){TABS_AND_SPACES}")" { - return T_UNSET_CAST; + RETURN_TOKEN(T_UNSET_CAST); } "eval" { - return T_EVAL; + RETURN_TOKEN(T_EVAL); } "include" { - return T_INCLUDE; + RETURN_TOKEN(T_INCLUDE); } "include_once" { - return T_INCLUDE_ONCE; + RETURN_TOKEN(T_INCLUDE_ONCE); } "require" { - return T_REQUIRE; + RETURN_TOKEN(T_REQUIRE); } "require_once" { - return T_REQUIRE_ONCE; + RETURN_TOKEN(T_REQUIRE_ONCE); } "namespace" { - return T_NAMESPACE; + RETURN_TOKEN(T_NAMESPACE); } "use" { - return T_USE; + RETURN_TOKEN(T_USE); } "insteadof" { - return T_INSTEADOF; + RETURN_TOKEN(T_INSTEADOF); } "global" { - return T_GLOBAL; + RETURN_TOKEN(T_GLOBAL); } "isset" { - return T_ISSET; + RETURN_TOKEN(T_ISSET); } "empty" { - return T_EMPTY; + RETURN_TOKEN(T_EMPTY); } "__halt_compiler" { - return T_HALT_COMPILER; + RETURN_TOKEN(T_HALT_COMPILER); } "static" { - return T_STATIC; + RETURN_TOKEN(T_STATIC); } "abstract" { - return T_ABSTRACT; + RETURN_TOKEN(T_ABSTRACT); } "final" { - return T_FINAL; + RETURN_TOKEN(T_FINAL); } "private" { - return T_PRIVATE; + RETURN_TOKEN(T_PRIVATE); } 
"protected" { - return T_PROTECTED; + RETURN_TOKEN(T_PROTECTED); } "public" { - return T_PUBLIC; + RETURN_TOKEN(T_PUBLIC); } "unset" { - return T_UNSET; + RETURN_TOKEN(T_UNSET); } "=>" { - return T_DOUBLE_ARROW; + RETURN_TOKEN(T_DOUBLE_ARROW); } "list" { - return T_LIST; + RETURN_TOKEN(T_LIST); } "array" { - return T_ARRAY; + RETURN_TOKEN(T_ARRAY); } "callable" { - return T_CALLABLE; + RETURN_TOKEN(T_CALLABLE); } "++" { - return T_INC; + RETURN_TOKEN(T_INC); } "--" { - return T_DEC; + RETURN_TOKEN(T_DEC); } "===" { - return T_IS_IDENTICAL; + RETURN_TOKEN(T_IS_IDENTICAL); } "!==" { - return T_IS_NOT_IDENTICAL; + RETURN_TOKEN(T_IS_NOT_IDENTICAL); } "==" { - return T_IS_EQUAL; + RETURN_TOKEN(T_IS_EQUAL); } "!="|"<>" { - return T_IS_NOT_EQUAL; + RETURN_TOKEN(T_IS_NOT_EQUAL); } "<=>" { - return T_SPACESHIP; + RETURN_TOKEN(T_SPACESHIP); } "<=" { - return T_IS_SMALLER_OR_EQUAL; + RETURN_TOKEN(T_IS_SMALLER_OR_EQUAL); } ">=" { - return T_IS_GREATER_OR_EQUAL; + RETURN_TOKEN(T_IS_GREATER_OR_EQUAL); } "+=" { - return T_PLUS_EQUAL; + RETURN_TOKEN(T_PLUS_EQUAL); } "-=" { - return T_MINUS_EQUAL; + RETURN_TOKEN(T_MINUS_EQUAL); } "*=" { - return T_MUL_EQUAL; + RETURN_TOKEN(T_MUL_EQUAL); } "*\*" { - return T_POW; + RETURN_TOKEN(T_POW); } "*\*=" { - return T_POW_EQUAL; + RETURN_TOKEN(T_POW_EQUAL); } "/=" { - return T_DIV_EQUAL; + RETURN_TOKEN(T_DIV_EQUAL); } ".=" { - return T_CONCAT_EQUAL; + RETURN_TOKEN(T_CONCAT_EQUAL); } "%=" { - return T_MOD_EQUAL; + RETURN_TOKEN(T_MOD_EQUAL); } "<<=" { - return T_SL_EQUAL; + RETURN_TOKEN(T_SL_EQUAL); } ">>=" { - return T_SR_EQUAL; + RETURN_TOKEN(T_SR_EQUAL); } "&=" { - return T_AND_EQUAL; + RETURN_TOKEN(T_AND_EQUAL); } "|=" { - return T_OR_EQUAL; + RETURN_TOKEN(T_OR_EQUAL); } "^=" { - return T_XOR_EQUAL; + RETURN_TOKEN(T_XOR_EQUAL); } "||" { - return T_BOOLEAN_OR; + RETURN_TOKEN(T_BOOLEAN_OR); } "&&" { - return T_BOOLEAN_AND; + RETURN_TOKEN(T_BOOLEAN_AND); } "OR" { - return T_LOGICAL_OR; + RETURN_TOKEN(T_LOGICAL_OR); } "AND" { - return 
T_LOGICAL_AND; + RETURN_TOKEN(T_LOGICAL_AND); } "XOR" { - return T_LOGICAL_XOR; + RETURN_TOKEN(T_LOGICAL_XOR); } "<<" { - return T_SL; + RETURN_TOKEN(T_SL); } ">>" { - return T_SR; + RETURN_TOKEN(T_SR); } {TOKENS} { - return yytext[0]; + RETURN_TOKEN(yytext[0]); } "{" { yy_push_state(ST_IN_SCRIPTING); - return '{'; + RETURN_TOKEN('{'); } "${" { yy_push_state(ST_LOOKING_FOR_VARNAME); - return T_DOLLAR_OPEN_CURLY_BRACES; + RETURN_TOKEN(T_DOLLAR_OPEN_CURLY_BRACES); } @@ -1578,7 +1601,7 @@ NEWLINE ("\r"|"\n"|"\r\n") if (!zend_stack_is_empty(&SCNG(state_stack))) { yy_pop_state(); } - return '}'; + RETURN_TOKEN('}'); } @@ -1587,7 +1610,7 @@ NEWLINE ("\r"|"\n"|"\r\n") zend_copy_value(zendlval, yytext, yyleng); yy_pop_state(); yy_push_state(ST_IN_SCRIPTING); - return T_STRING_VARNAME; + RETURN_TOKEN(T_STRING_VARNAME); } @@ -1617,12 +1640,12 @@ NEWLINE ("\r"|"\n"|"\r\n") ZVAL_LONG(zendlval, ZEND_STRTOL(bin, &end, 2)); ZEND_ASSERT(!errno && end == yytext + yyleng); } - return T_LNUMBER; + RETURN_TOKEN(T_LNUMBER); } else { ZVAL_DOUBLE(zendlval, zend_bin_strtod(bin, (const char **)&end)); /* errno isn't checked since we allow HUGE_VAL/INF overflow */ ZEND_ASSERT(end == yytext + yyleng); - return T_DNUMBER; + RETURN_TOKEN(T_DNUMBER); } } @@ -1636,7 +1659,7 @@ NEWLINE ("\r"|"\n"|"\r\n") */ if (end != yytext + yyleng) { zend_throw_exception(zend_get_parse_exception(), "Invalid numeric literal", E_PARSE); - return T_ERROR; + RETURN_TOKEN(T_ERROR); } } else { errno = 0; @@ -1653,19 +1676,19 @@ NEWLINE ("\r"|"\n"|"\r\n") if (end != yytext + yyleng) { zend_throw_exception(zend_get_parse_exception(), "Invalid numeric literal", E_PARSE); - return T_ERROR; + RETURN_TOKEN(T_ERROR); } ZEND_ASSERT(!errno); - return T_DNUMBER; + RETURN_TOKEN(T_DNUMBER); } /* Also not an assert for the same reason */ if (end != yytext + yyleng) { zend_throw_exception(zend_get_parse_exception(), "Invalid numeric literal", E_PARSE); - return T_ERROR; + RETURN_TOKEN(T_ERROR); } } ZEND_ASSERT(!errno); - return 
T_LNUMBER; + RETURN_TOKEN(T_LNUMBER); } {HNUM} { @@ -1687,12 +1710,12 @@ NEWLINE ("\r"|"\n"|"\r\n") ZVAL_LONG(zendlval, ZEND_STRTOL(hex, &end, 16)); ZEND_ASSERT(!errno && end == hex + len); } - return T_LNUMBER; + RETURN_TOKEN(T_LNUMBER); } else { ZVAL_DOUBLE(zendlval, zend_hex_strtod(hex, (const char **)&end)); /* errno isn't checked since we allow HUGE_VAL/INF overflow */ ZEND_ASSERT(end == hex + len); - return T_DNUMBER; + RETURN_TOKEN(T_DNUMBER); } } @@ -1709,12 +1732,12 @@ NEWLINE ("\r"|"\n"|"\r\n") string: ZVAL_STRINGL(zendlval, yytext, yyleng); } - return T_NUM_STRING; + RETURN_TOKEN(T_NUM_STRING); } {LNUM}|{HNUM}|{BNUM} { /* Offset must be treated as a string */ ZVAL_STRINGL(zendlval, yytext, yyleng); - return T_NUM_STRING; + RETURN_TOKEN(T_NUM_STRING); } {DNUM}|{EXPONENT_DNUM} { @@ -1723,59 +1746,59 @@ string: ZVAL_DOUBLE(zendlval, zend_strtod(yytext, &end)); /* errno isn't checked since we allow HUGE_VAL/INF overflow */ ZEND_ASSERT(end == yytext + yyleng); - return T_DNUMBER; + RETURN_TOKEN(T_DNUMBER); } "__CLASS__" { - return T_CLASS_C; + RETURN_TOKEN(T_CLASS_C); } "__TRAIT__" { - return T_TRAIT_C; + RETURN_TOKEN(T_TRAIT_C); } "__FUNCTION__" { - return T_FUNC_C; + RETURN_TOKEN(T_FUNC_C); } "__METHOD__" { - return T_METHOD_C; + RETURN_TOKEN(T_METHOD_C); } "__LINE__" { - return T_LINE; + RETURN_TOKEN(T_LINE); } "__FILE__" { - return T_FILE; + RETURN_TOKEN(T_FILE); } "__DIR__" { - return T_DIR; + RETURN_TOKEN(T_DIR); } "__NAMESPACE__" { - return T_NS_C; + RETURN_TOKEN(T_NS_C); } """{ANY_CHAR} { if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } inline_char_handler: @@ -1823,7 +1846,7 @@ inline_char_handler: ZVAL_STRINGL(zendlval, yytext, yyleng); } HANDLE_NEWLINES(yytext, yyleng); - return T_INLINE_HTML; + RETURN_TOKEN(T_INLINE_HTML); } @@ -1834,7 +1857,7 @@ inline_char_handler: yyless(yyleng - 3); yy_push_state(ST_LOOKING_FOR_PROPERTY); zend_copy_value(zendlval, (yytext+1), (yyleng-1)); - return T_VARIABLE; + RETURN_TOKEN(T_VARIABLE); } /* A [ 
always designates a variable offset, regardless of what follows @@ -1843,22 +1866,22 @@ inline_char_handler: yyless(yyleng - 1); yy_push_state(ST_VAR_OFFSET); zend_copy_value(zendlval, (yytext+1), (yyleng-1)); - return T_VARIABLE; + RETURN_TOKEN(T_VARIABLE); } "$"{LABEL} { zend_copy_value(zendlval, (yytext+1), (yyleng-1)); - return T_VARIABLE; + RETURN_TOKEN(T_VARIABLE); } "]" { yy_pop_state(); - return ']'; + RETURN_TOKEN(']'); } {TOKENS}|[{}"`] { /* Only '[' can be valid, but returning other tokens will allow a more explicit parse error */ - return yytext[0]; + RETURN_TOKEN(yytext[0]); } [ \n\r\t\\'#] { @@ -1866,16 +1889,16 @@ inline_char_handler: yyless(0); yy_pop_state(); ZVAL_NULL(zendlval); - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } {LABEL} { zend_copy_value(zendlval, yytext, yyleng); - return T_STRING; + RETURN_TOKEN(T_STRING); } -"#"|"//" { +"#"|"//" { while (YYCURSOR < YYLIMIT) { switch (*YYCURSOR++) { case '\r': @@ -1901,10 +1924,10 @@ inline_char_handler: yyleng = YYCURSOR - SCNG(yy_text); - return T_COMMENT; + RETURN_TOKEN(T_COMMENT); } -"/*"|"/**"{WHITESPACE} { +"/*"|"/**"{WHITESPACE} { int doc_com; if (yyleng > 2) { @@ -1931,27 +1954,15 @@ inline_char_handler: if (doc_com) { CG(doc_comment) = zend_string_init(yytext, yyleng, 0); - return T_DOC_COMMENT; + RETURN_TOKEN(T_DOC_COMMENT); } - return T_COMMENT; -} - -{LABEL} { - zend_copy_value(zendlval, yytext, yyleng); - yy_pop_state(); - return T_STRING; -} - -{ANY_CHAR} { - yyless(0); - yy_pop_state(); - goto restart; + RETURN_TOKEN(T_COMMENT); } "?>"{NEWLINE}? { BEGIN(INITIAL); - return T_CLOSE_TAG; /* implicit ';' at php-end tag */ + RETURN_TOKEN(T_CLOSE_TAG); /* implicit ';' at php-end tag */ } @@ -1977,7 +1988,7 @@ inline_char_handler: * for ' (unrecognized by parser), instead of old flex fallback to "Unexpected character..." 
* rule, which continued in ST_IN_SCRIPTING state after the quote */ ZVAL_NULL(zendlval); - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } } @@ -2020,7 +2031,7 @@ inline_char_handler: SCNG(output_filter)((unsigned char **)&str, &sz, (unsigned char *)s, (size_t)Z_STRLEN_P(zendlval)); ZVAL_STRINGL(zendlval, str, sz); } - return T_CONSTANT_ENCAPSED_STRING; + RETURN_TOKEN(T_CONSTANT_ENCAPSED_STRING); } @@ -2032,9 +2043,9 @@ inline_char_handler: case '"': yyleng = YYCURSOR - SCNG(yy_text); if (zend_scan_escape_string(zendlval, yytext+bprefix+1, yyleng-bprefix-2, '"') == FAILURE) { - return T_ERROR; + RETURN_TOKEN(T_ERROR); } - return T_CONSTANT_ENCAPSED_STRING; + RETURN_TOKEN(T_CONSTANT_ENCAPSED_STRING); case '$': if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') { break; @@ -2064,7 +2075,7 @@ inline_char_handler: YYCURSOR = SCNG(yy_text) + yyleng; BEGIN(ST_DOUBLE_QUOTES); - return '"'; + RETURN_TOKEN('"'); } @@ -2112,13 +2123,13 @@ inline_char_handler: zend_ptr_stack_push(&SCNG(heredoc_label_stack), (void *) heredoc_label); - return T_START_HEREDOC; + RETURN_TOKEN(T_START_HEREDOC); } [`] { BEGIN(ST_BACKQUOTE); - return '`'; + RETURN_TOKEN('`'); } @@ -2132,7 +2143,7 @@ inline_char_handler: efree(heredoc_label); BEGIN(ST_IN_SCRIPTING); - return T_END_HEREDOC; + RETURN_TOKEN(T_END_HEREDOC); } @@ -2140,18 +2151,18 @@ inline_char_handler: Z_LVAL_P(zendlval) = (zend_long) '{'; yy_push_state(ST_IN_SCRIPTING); yyless(1); - return T_CURLY_OPEN; + RETURN_TOKEN(T_CURLY_OPEN); } ["] { BEGIN(ST_IN_SCRIPTING); - return '"'; + RETURN_TOKEN('"'); } [`] { BEGIN(ST_IN_SCRIPTING); - return '`'; + RETURN_TOKEN('`'); } @@ -2164,7 +2175,7 @@ inline_char_handler: } if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } if (yytext[0] == '\\' && YYCURSOR < YYLIMIT) { YYCURSOR++; @@ -2201,15 +2212,15 @@ double_quotes_scan_done: yyleng = YYCURSOR - SCNG(yy_text); if (zend_scan_escape_string(zendlval, yytext, yyleng, '"') == FAILURE) { - return T_ERROR; + 
RETURN_TOKEN(T_ERROR); } - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } {ANY_CHAR} { if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } if (yytext[0] == '\\' && YYCURSOR < YYLIMIT) { YYCURSOR++; @@ -2245,9 +2256,9 @@ double_quotes_scan_done: yyleng = YYCURSOR - SCNG(yy_text); if (zend_scan_escape_string(zendlval, yytext, yyleng, '`') == FAILURE) { - return T_ERROR; + RETURN_TOKEN(T_ERROR); } - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } @@ -2257,7 +2268,7 @@ double_quotes_scan_done: zend_heredoc_label *heredoc_label = zend_ptr_stack_top(&SCNG(heredoc_label_stack)); if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } YYCURSOR--; @@ -2321,9 +2332,9 @@ heredoc_scan_done: yyleng = YYCURSOR - SCNG(yy_text); if (zend_scan_escape_string(zendlval, yytext, yyleng - newline, 0) == FAILURE) { - return T_ERROR; + RETURN_TOKEN(T_ERROR); } - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } @@ -2333,7 +2344,7 @@ heredoc_scan_done: zend_heredoc_label *heredoc_label = zend_ptr_stack_top(&SCNG(heredoc_label_stack)); if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } YYCURSOR--; @@ -2380,13 +2391,13 @@ nowdoc_scan_done: zend_copy_value(zendlval, yytext, yyleng - newline); HANDLE_NEWLINES(yytext, yyleng - newline); - return T_ENCAPSED_AND_WHITESPACE; + RETURN_TOKEN(T_ENCAPSED_AND_WHITESPACE); } {ANY_CHAR} { if (YYCURSOR > YYLIMIT) { - return 0; + RETURN_TOKEN(END); } zend_error(E_COMPILE_WARNING,"Unexpected character in input: '%c' (ASCII=%d) state=%d", yytext[0], yytext[0], YYSTATE); diff --git a/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt new file mode 100644 index 0000000000000..03b991b1a5db7 --- /dev/null +++ b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_000.phpt @@ -0,0 +1,19 @@ +--TEST-- +Parse errors during token_get_all() with TOKEN_PARSE flag +--SKIPIF-- + +--FILE-- +getMessage(), 
PHP_EOL; +} + +echo "Done"; + +?> +--EXPECT-- +syntax error, unexpected 'code' (T_STRING) +Done diff --git a/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt new file mode 100644 index 0000000000000..ab334358abc04 --- /dev/null +++ b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_001.phpt @@ -0,0 +1,81 @@ +--TEST-- +Semi reserved words support: member access +--SKIPIF-- + +--FILE-- +$continue; +X::continue(); +$x->continue(); +X::class; + +class X { + const CONTINUE = 1; + public $x = self::CONTINUE + 1; +} +', TOKEN_PARSE); + +array_walk($tokens, function($tk) { + if(is_array($tk)) { + if(($t = token_name($tk[0])) == 'T_WHITESPACE') return; + echo "L{$tk[2]}: ".$t." {$tk[1]}", PHP_EOL; + } + else echo $tk, PHP_EOL; +}); + +echo "Done"; + +?> +--EXPECTF-- +L1: T_OPEN_TAG +L4: T_VARIABLE $continue +; +L5: T_STRING X +L5: T_DOUBLE_COLON :: +L5: T_STRING continue +( +) +; +L6: T_VARIABLE $x +L6: T_OBJECT_OPERATOR -> +L6: T_STRING continue +( +) +; +L7: T_STRING X +L7: T_DOUBLE_COLON :: +L7: T_CLASS class +; +L9: T_CLASS class +L9: T_STRING X +{ +L10: T_CONST const +L10: T_STRING CONTINUE += +L10: T_LNUMBER 1 +; +L11: T_PUBLIC public +L11: T_VARIABLE $x += +L11: T_STRING self +L11: T_DOUBLE_COLON :: +L11: T_STRING CONTINUE ++ +L11: T_LNUMBER 1 +; +} +Done diff --git a/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt new file mode 100644 index 0000000000000..3dd8e14d8423a --- /dev/null +++ b/ext/tokenizer/tests/token_get_all_TOKEN_PARSE_002.phpt @@ -0,0 +1,68 @@ +--TEST-- +Semi reserved words support: class const +--SKIPIF-- + +--FILE-- + [3, 4], 5]; + } +', TOKEN_PARSE); + +array_walk($tokens, function($tk) { + if(is_array($tk)) { + if(($t = token_name($tk[0])) == 'T_WHITESPACE') return; + echo "L{$tk[2]}: ".$t." 
{$tk[1]}", PHP_EOL; + } + else echo $tk, PHP_EOL; +}); + +echo "Done"; + +?> +--EXPECTF-- +L1: T_OPEN_TAG +[ +L5: T_LNUMBER 3 +, +L5: T_LNUMBER 4 +] +, +L5: T_LNUMBER 5 +] +; +} +Done diff --git a/ext/tokenizer/tests/token_get_all_error.phpt b/ext/tokenizer/tests/token_get_all_error.phpt index 29e97c38c4071..9ded0a177425f 100644 --- a/ext/tokenizer/tests/token_get_all_error.phpt +++ b/ext/tokenizer/tests/token_get_all_error.phpt @@ -19,7 +19,7 @@ var_dump( token_get_all()); echo "-- Testing token_get_all() function with more than expected no. of arguments --\n"; $source = ''; $extra_arg = 10; -var_dump( token_get_all($source, $extra_arg)); +var_dump( token_get_all($source, true, $extra_arg)); echo "Done" ?> @@ -28,10 +28,10 @@ echo "Done" -- Testing token_get_all() function with zero arguments -- -Warning: token_get_all() expects exactly 1 parameter, 0 given in %s on line %d +Warning: token_get_all() expects at least 1 parameter, 0 given in %s on line 11 NULL -- Testing token_get_all() function with more than expected no. 
of arguments -- -Warning: token_get_all() expects exactly 1 parameter, 2 given in %s on line %d +Warning: token_get_all() expects at most 2 parameters, 3 given in %s on line 17 NULL -Done +Done \ No newline at end of file diff --git a/ext/tokenizer/tokenizer.c b/ext/tokenizer/tokenizer.c index c4b9d14359fd9..2a4fa90ca2798 100644 --- a/ext/tokenizer/tokenizer.c +++ b/ext/tokenizer/tokenizer.c @@ -37,6 +37,12 @@ #define zendcursor LANG_SCNG(yy_cursor) #define zendlimit LANG_SCNG(yy_limit) +#define TOKEN_PARSE 1 + +void tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS) { + REGISTER_LONG_CONSTANT("TOKEN_PARSE", TOKEN_PARSE, CONST_CS|CONST_PERSISTENT); +} + /* {{{ arginfo */ ZEND_BEGIN_ARG_INFO_EX(arginfo_token_get_all, 0, 0, 1) ZEND_ARG_INFO(0, source) @@ -83,6 +89,7 @@ ZEND_GET_MODULE(tokenizer) PHP_MINIT_FUNCTION(tokenizer) { tokenizer_register_constants(INIT_FUNC_ARGS_PASSTHRU); + tokenizer_token_get_all_register_constants(INIT_FUNC_ARGS_PASSTHRU); return SUCCESS; } /* }}} */ @@ -97,8 +104,10 @@ PHP_MINFO_FUNCTION(tokenizer) } /* }}} */ -static void tokenize(zval *return_value) +static zend_bool tokenize(zval *return_value, zend_string *source) { + zval source_zval; + zend_lex_state original_lex_state; zval token; zval keyword; int token_type; @@ -106,10 +115,22 @@ static void tokenize(zval *return_value) int token_line = 1; int need_tokens = -1; /* for __halt_compiler lexing. 
-1 = disabled */ + ZVAL_STR_COPY(&source_zval, source); + zend_save_lexical_state(&original_lex_state); + + if (zend_prepare_string_for_scanning(&source_zval, "") == FAILURE) { + zend_restore_lexical_state(&original_lex_state); + return 0; + } + + LANG_SCNG(yy_state) = yycINITIAL; array_init(return_value); ZVAL_NULL(&token); while ((token_type = lex_scan(&token))) { + + if(token_type == T_ERROR) break; + destroy = 1; switch (token_type) { case T_CLOSE_TAG: @@ -123,8 +144,6 @@ static void tokenize(zval *return_value) case T_DOC_COMMENT: destroy = 0; break; - case T_ERROR: - return; } if (token_type >= 256) { @@ -169,34 +188,113 @@ static void tokenize(zval *return_value) token_line = CG(zend_lineno); } + + zval_dtor(&source_zval); + zend_restore_lexical_state(&original_lex_state); + + return 1; } -/* {{{ proto array token_get_all(string source) - */ -PHP_FUNCTION(token_get_all) +zval token_stream; + +void on_event(zend_php_scanner_event event, int token, int line) { - zend_string *source; - zval source_zval; - zend_lex_state original_lex_state; + zval keyword; + HashTable *tokens_ht; + zval *token_zv; - if (zend_parse_parameters(ZEND_NUM_ARGS(), "S", &source) == FAILURE) { - return; + switch(event) { + case ON_TOKEN: + if (token == T_ERROR || token == END) break; + if (token >= 256) { + array_init(&keyword); + add_next_index_long(&keyword, token); + add_next_index_stringl(&keyword, (char *)LANG_SCNG(yy_text), LANG_SCNG(yy_leng)); + add_next_index_long(&keyword, line); + add_next_index_zval(&token_stream, &keyword); + } else { + add_next_index_stringl(&token_stream, (char *)LANG_SCNG(yy_text), LANG_SCNG(yy_leng)); + } + break; + case ON_FEEDBACK: + tokens_ht = Z_ARRVAL(token_stream); + token_zv = zend_hash_index_find(tokens_ht, zend_hash_num_elements(tokens_ht) - 1); + if (token_zv && Z_TYPE_P(token_zv) == IS_ARRAY) { + ZVAL_LONG(zend_hash_index_find(Z_ARRVAL_P(token_zv), 0), token); + } + break; + case ON_STOP: + if (LANG_SCNG(yy_cursor) != LANG_SCNG(yy_limit)) { + 
array_init(&keyword); + add_next_index_long(&keyword, T_INLINE_HTML); + add_next_index_stringl(&keyword, + (char *)LANG_SCNG(yy_cursor), LANG_SCNG(yy_limit) - LANG_SCNG(yy_cursor)); + add_next_index_long(&keyword, CG(zend_lineno)); + add_next_index_zval(&token_stream, &keyword); + } + break; } +} + +static zend_bool tokenize_parse(zval *return_value, zend_string *source) +{ + zval source_zval; + zend_lex_state original_lex_state; + zend_bool original_in_compilation; + zend_bool success; ZVAL_STR_COPY(&source_zval, source); + + original_in_compilation = CG(in_compilation); + CG(in_compilation) = 1; zend_save_lexical_state(&original_lex_state); - if (zend_prepare_string_for_scanning(&source_zval, "") == FAILURE) { - zend_restore_lexical_state(&original_lex_state); - RETURN_FALSE; - } + if ((success = (zend_prepare_string_for_scanning(&source_zval, "") == SUCCESS))) { + CG(ast) = NULL; + CG(ast_arena) = zend_arena_create(1024 * 32); + LANG_SCNG(yy_state) = yycINITIAL; + LANG_SCNG(on_event) = on_event; - LANG_SCNG(yy_state) = yycINITIAL; + array_init(&token_stream); + if((success = (zendparse() == SUCCESS))) { + ZVAL_ZVAL(return_value, &token_stream, 1, 0); + } + zval_dtor(&token_stream); - tokenize(return_value); + zend_ast_destroy(CG(ast)); + zend_arena_destroy(CG(ast_arena)); + } + /* restore compiler and scanner global states */ zend_restore_lexical_state(&original_lex_state); + CG(in_compilation) = original_in_compilation; + zval_dtor(&source_zval); + + return success; +} + +/* }}} */ + +/* {{{ proto array token_get_all(string source) + */ +PHP_FUNCTION(token_get_all) +{ + zend_string *source; + zend_long flags = 0; + zend_bool success; + + if (zend_parse_parameters(ZEND_NUM_ARGS(), "S|l", &source, &flags) == FAILURE) { + return; + } + + if (flags & TOKEN_PARSE) { + success = tokenize_parse(return_value, source); + } else { + success = tokenize(return_value, source); + } + + if (!success) RETURN_FALSE; } /* }}} */