From 1fac9fed647909b92f3779dd34beb8564f6f247b Mon Sep 17 00:00:00 2001 From: David Beazley Date: Sat, 22 Feb 2020 15:57:54 -0600 Subject: [PATCH] Massive refactoring/cleanup --- CHANGES | 36 +- CONTRIBUTING.md | 18 +- README.md | 10 +- doc/internal.html | 874 ------- doc/makedoc.py | 194 -- doc/ply.html | 3496 -------------------------- docs/Makefile | 192 ++ docs/conf.py | 284 +++ docs/index.rst | 58 + docs/internals.rst | 530 ++++ docs/make.bat | 263 ++ docs/ply.rst | 2656 +++++++++++++++++++ example/BASIC/basic.py | 7 +- example/GardenSnake/GardenSnake.py | 777 ------ example/GardenSnake/README | 5 - example/README | 1 - example/calc/calc.py | 14 +- example/calcdebug/calc.py | 7 +- example/calceof/calc.py | 9 +- example/classcalc/calc.py | 13 +- example/closurecalc/calc.py | 5 +- example/hedit/hedit.py | 48 - example/newclasscalc/calc.py | 167 -- example/optcalc/README | 9 - example/optcalc/calc.py | 134 - example/unicalc/calc.py | 133 - example/yply/yparse.py | 2 +- ply/__init__.py | 1 + ply/lex.py | 277 +- ply/yacc.py | 1364 ++-------- ply/ygen.py | 69 - setup.md | 14 - test/lex_many_tokens.py | 2 +- test/lex_opt_alias.py | 54 - test/lex_optimize.py | 50 - test/lex_optimize2.py | 50 - test/lex_optimize3.py | 52 - test/lex_optimize4.py | 26 - test/lex_token5.py | 31 - test/pkg_test1/__init__.py | 9 - test/pkg_test1/parsing/__init__.py | 0 test/pkg_test1/parsing/calclex.py | 47 - test/pkg_test1/parsing/calcparse.py | 66 - test/pkg_test2/__init__.py | 9 - test/pkg_test2/parsing/__init__.py | 0 test/pkg_test2/parsing/calclex.py | 47 - test/pkg_test2/parsing/calcparse.py | 66 - test/pkg_test3/__init__.py | 9 - test/pkg_test3/generated/__init__.py | 0 test/pkg_test3/parsing/__init__.py | 0 test/pkg_test3/parsing/calclex.py | 47 - test/pkg_test3/parsing/calcparse.py | 66 - test/pkg_test4/__init__.py | 25 - test/pkg_test4/parsing/__init__.py | 0 test/pkg_test4/parsing/calclex.py | 47 - test/pkg_test4/parsing/calcparse.py | 66 - test/pkg_test5/__init__.py | 9 - test/pkg_test5/parsing/__init__.py | 0 test/pkg_test5/parsing/calclex.py | 48 - test/pkg_test5/parsing/calcparse.py | 67 - test/pkg_test6/__init__.py | 9 - test/pkg_test6/parsing/__init__.py | 0 test/pkg_test6/parsing/calclex.py | 48 - test/pkg_test6/parsing/calcparse.py | 33 - test/pkg_test6/parsing/expression.py | 31 - test/pkg_test6/parsing/statement.py | 9 - test/testlex.py | 282 +-- test/testyacc.py | 48 - test/yacc_error7.py | 4 +- test/yacc_nested.py | 2 +- 70 files changed, 4263 insertions(+), 8763 deletions(-) delete mode 100644 doc/internal.html delete mode 100644 doc/makedoc.py delete mode 100644 doc/ply.html create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/internals.rst create mode 100644 docs/make.bat create mode 100644 docs/ply.rst delete mode 100644 example/GardenSnake/GardenSnake.py delete mode 100644 example/GardenSnake/README delete mode 100644 example/hedit/hedit.py delete mode 100755 example/newclasscalc/calc.py delete mode 100644 example/optcalc/README delete mode 100644 example/optcalc/calc.py delete mode 100644 example/unicalc/calc.py delete mode 100644 ply/ygen.py delete mode 100644 test/lex_opt_alias.py delete mode 100644 test/lex_optimize.py delete mode 100644 test/lex_optimize2.py delete mode 100644 test/lex_optimize3.py delete mode 100644 test/lex_optimize4.py delete mode 100644 test/lex_token5.py delete mode 100644 test/pkg_test1/__init__.py delete mode 100644 test/pkg_test1/parsing/__init__.py delete mode 100644 
test/pkg_test1/parsing/calclex.py
 delete mode 100644 test/pkg_test1/parsing/calcparse.py
 delete mode 100644 test/pkg_test2/__init__.py
 delete mode 100644 test/pkg_test2/parsing/__init__.py
 delete mode 100644 test/pkg_test2/parsing/calclex.py
 delete mode 100644 test/pkg_test2/parsing/calcparse.py
 delete mode 100644 test/pkg_test3/__init__.py
 delete mode 100644 test/pkg_test3/generated/__init__.py
 delete mode 100644 test/pkg_test3/parsing/__init__.py
 delete mode 100644 test/pkg_test3/parsing/calclex.py
 delete mode 100644 test/pkg_test3/parsing/calcparse.py
 delete mode 100644 test/pkg_test4/__init__.py
 delete mode 100644 test/pkg_test4/parsing/__init__.py
 delete mode 100644 test/pkg_test4/parsing/calclex.py
 delete mode 100644 test/pkg_test4/parsing/calcparse.py
 delete mode 100644 test/pkg_test5/__init__.py
 delete mode 100644 test/pkg_test5/parsing/__init__.py
 delete mode 100644 test/pkg_test5/parsing/calclex.py
 delete mode 100644 test/pkg_test5/parsing/calcparse.py
 delete mode 100644 test/pkg_test6/__init__.py
 delete mode 100644 test/pkg_test6/parsing/__init__.py
 delete mode 100644 test/pkg_test6/parsing/calclex.py
 delete mode 100644 test/pkg_test6/parsing/calcparse.py
 delete mode 100644 test/pkg_test6/parsing/expression.py
 delete mode 100644 test/pkg_test6/parsing/statement.py

diff --git a/CHANGES b/CHANGES
index 471331a..749d16a 100644
--- a/CHANGES
+++ b/CHANGES
@@ -4,10 +4,42 @@ IMPORTANT NOTE (2018-12-22):
 PLY is no longer released in any package-installable format. If you
 want to use the latest version, you need to COPY the contents of the
 ply/ directory into your own project and use it. Although PLY is no
 longer distributed as a package, it is
-maintained as a mature library. No new features are planned, but
-issues and pull requests for bugs are still welcome. Any changes to the
+maintained as a mature library. No new major features are planned, but
+issues reported for bugs are still welcome. Any changes to the
 software will be noted here.
 
+Version 4.0 (In progress)
+-------------------------
+Note: The 4.0 series of PLY represents a massive cleanup and modernization
+effort. At a fundamental level, no new "features" are being added.
+Instead, a lot of outdated, inconsistent, and problematic features are
+being eliminated. Here is a short summary:
+
+  - PLY no longer writes table files or cached data. If you want this,
+    it's your responsibility to serialize the parser tables. Use pickle.
+
+  - Elimination of side-effects and global variables (generally).
+
+  - Elimination of numerous optional features in an effort to
+    simplify the API.
+
+  - More use of modern Python features including iterators/generators,
+    keyword-only arguments, f-strings, etc.
+
+  - Dropped support for Python 2.x
+------------------------
+
+01/26/20  PLY no longer writes cached table files. Honestly, the use of
+          the cached files made more sense when I was developing PLY on
+          my 200MHz PC in 2001. It's not as much of an issue now. For small
+          to medium sized grammars, PLY should be almost instantaneous.
+          If you're working with a large grammar, you can arrange
+          to pickle the associated grammar instance yourself if need be.
+          The removal of table files eliminated a large number of optional
+          arguments to yacc() concerning the names and packages of these files.
+
+01/26/20  PLY no longer supports Python 2.x.
+
 01/01/20  Added an install.py script to make it easy to install PLY into
           a virtual environment if you just want to play with it.
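The 01/26/20 note above suggests handling caching yourself with pickle if you need it. A minimal sketch of that pattern is shown below; calcparse is a hypothetical module that builds its parser with yacc.yacc() at import time, and whether any particular parser or grammar object round-trips through pickle depends on what it references.

import pickle
import calcparse          # hypothetical module that does: parser = yacc.yacc()

# Serialize the constructed parser once...
with open('parser.pickle', 'wb') as f:
    pickle.dump(calcparse.parser, f)

# ...and reload it later instead of rebuilding the tables from scratch.
with open('parser.pickle', 'rb') as f:
    parser = pickle.load(f)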
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7a62354..3e1cad6 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -3,19 +3,5 @@ Contributing to PLY
 
 PLY is a mature project. However, if you feel that you have found a
 bug in PLY or its documentation, please submit an issue in the form
-of a bug report.
-
-Important note: The Github repo for PLY always contains the most
-up-to-date version of the software.  If you want to use the current
-version, you should COPY the contents of the `ply/` directory into
-your own project and use it.  PLY is free software for you to use,
-but it is not maintained as a package that you install using pip
-or similar tools.
-
-
-
-
-
-
-
-
+of a bug report at https://github.com/dabeaz/ply. Pull requests
+to the project are not accepted.

diff --git a/README.md b/README.md
index 995f597..dca94d8 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ virtual environment.
 
 PLY has no third-party dependencies.
 
-The file doc/ply.html contains complete documentation on how to use
+The docs/ directory contains complete documentation on how to use
 the system.
 
 The example directory contains several different examples including a
@@ -176,7 +176,7 @@ def t_newline(t):
     t.lexer.lineno += t.value.count("\n")
 
 def t_error(t):
-    print("Illegal character '%s'" % t.value[0])
+    print(f"Illegal character {t.value[0]!r}")
     t.lexer.skip(1)
 
 # Build the lexer
@@ -228,18 +228,18 @@ def p_expression_name(p):
     try:
         p[0] = names[p[1]]
     except LookupError:
-        print("Undefined name '%s'" % p[1])
+        print(f"Undefined name {p[1]!r}")
         p[0] = 0
 
 def p_error(p):
-    print("Syntax error at '%s'" % p.value)
+    print(f"Syntax error at {p.value!r}")
 
 import ply.yacc as yacc
 yacc.yacc()
 
 while True:
    try:
-       s = raw_input('calc > ')   # use input() on Python 3
+       s = input('calc > ')
    except EOFError:
        break
    yacc.parse(s)

diff --git a/doc/internal.html b/doc/internal.html
deleted file mode 100644
index 57e87df..0000000
--- a/doc/internal.html
+++ /dev/null
@@ -1,874 +0,0 @@

-PLY Internals

-David M. Beazley
-dave@dabeaz.com

-PLY Version: 3.11
1. Introduction

-This document describes classes and functions that make up the internal operation of PLY. Using this programming interface, it is possible to manually build a parser using a different interface specification than what PLY normally uses. For example, you could build a grammar from information parsed in a completely different input format. Some of these objects may be useful for building more advanced parsing engines such as GLR.

-It should be stressed that using PLY at this level is not for the -faint of heart. Generally, it's assumed that you know a bit of -the underlying compiler theory and how an LR parser is put together. - -

2. Grammar Class

- - -The file ply.yacc defines a class Grammar that -is used to hold and manipulate information about a grammar -specification. It encapsulates the same basic information -about a grammar that is put into a YACC file including -the list of tokens, precedence rules, and grammar rules. -Various operations are provided to perform different validations -on the grammar. In addition, there are operations to compute -the first and follow sets that are needed by the various table -generation algorithms. - -

-Grammar(terminals) - -

-Creates a new grammar object. terminals is a list of strings -specifying the terminals for the grammar. An instance g of -Grammar has the following methods: -
- -

-g.set_precedence(term,assoc,level) -

-Sets the precedence level and associativity for a given terminal term. -assoc is one of 'right', -'left', or 'nonassoc' and level is a positive integer. The higher -the value of level, the higher the precedence. Here is an example of typical -precedence settings: - -
-g.set_precedence('PLUS',  'left',1)
-g.set_precedence('MINUS', 'left',1)
-g.set_precedence('TIMES', 'left',2)
-g.set_precedence('DIVIDE','left',2)
-g.set_precedence('UMINUS','left',3)
-
- -This method must be called prior to adding any productions to the -grammar with g.add_production(). The precedence of individual grammar -rules is determined by the precedence of the right-most terminal. - -
-

-g.add_production(name,syms,func=None,file='',line=0) -

-Adds a new grammar rule. name is the name of the rule, -syms is a list of symbols making up the right hand -side of the rule, func is the function to call when -reducing the rule. file and line specify -the filename and line number of the rule and are used for -generating error messages. - -

-The list of symbols in syms may include character -literals and %prec specifiers. Here are some -examples: - -

-g.add_production('expr',['expr','PLUS','term'],func,file,line)
-g.add_production('expr',['expr','"+"','term'],func,file,line)
-g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line)
-
- -

-If any kind of error is detected, a GrammarError exception -is raised with a message indicating the reason for the failure. -

- -

-g.set_start(start=None) -

-Sets the starting rule for the grammar. start is a string -specifying the name of the start rule. If start is omitted, -the first grammar rule added with add_production() is taken to be -the starting rule. This method must always be called after all -productions have been added. -
- -
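A minimal sketch of the calls described so far, assuming the PLY 3.x-era interface documented here; the 'p_expr_*' strings are illustrative reduction-function names that would only be bound to real callables later.

from ply.yacc import Grammar

g = Grammar(['NUMBER', 'PLUS'])                 # terminal names
g.set_precedence('PLUS', 'left', 1)             # must precede add_production()

g.add_production('expr', ['expr', 'PLUS', 'expr'], 'p_expr_plus', '<demo>', 1)
g.add_production('expr', ['NUMBER'], 'p_expr_num', '<demo>', 2)
g.set_start('expr')                             # call after all productions are added

print(g.undefined_symbols())                    # -> []
print(g.unused_terminals())                     # -> []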

-g.find_unreachable() -

-Diagnostic function. Returns a list of all unreachable non-terminals -defined in the grammar. This is used to identify inactive parts of -the grammar specification. -
- -

-g.infinite_cycle() -

-Diagnostic function. Returns a list of all non-terminals in the -grammar that result in an infinite cycle. This condition occurs if -there is no way for a grammar rule to expand to a string containing -only terminal symbols. -
- -

-g.undefined_symbols() -

-Diagnostic function. Returns a list of tuples (name, prod) -corresponding to undefined symbols in the grammar. name is the -name of the undefined symbol and prod is an instance of -Production which has information about the production rule -where the undefined symbol was used. -
- -

-g.unused_terminals() -

-Diagnostic function. Returns a list of terminals that were defined, -but never used in the grammar. -
- -

-g.unused_rules() -

-Diagnostic function. Returns a list of Production instances -corresponding to production rules that were defined in the grammar, -but never used anywhere. This is slightly different -than find_unreachable(). -
- -

-g.unused_precedence() -

-Diagnostic function. Returns a list of tuples (term, assoc) corresponding to precedence rules that were set, but never used in the grammar. term is the terminal name and assoc is the precedence associativity (e.g., 'left', 'right', or 'nonassoc').
- -

-g.compute_first() -

-Compute all of the first sets for all symbols in the grammar. Returns a dictionary -mapping symbol names to a list of all first symbols. -
- -

-g.compute_follow() -

-Compute all of the follow sets for all non-terminals in the grammar. -The follow set is the set of all possible symbols that might follow a -given non-terminal. Returns a dictionary mapping non-terminal names -to a list of symbols. -
- -

-g.build_lritems() -

-Calculates all of the LR items for all productions in the grammar. This -step is required before using the grammar for any kind of table generation. -See the section on LR items below. -
- -

-The following attributes are set by the above methods and may be useful -in code that works with the grammar. All of these attributes should be -assumed to be read-only. Changing their values directly will likely -break the grammar. - -

-g.Productions -

-A list of all productions added. The first entry is reserved for -a production representing the starting rule. The objects in this list -are instances of the Production class, described shortly. -
- -

-g.Prodnames -

-A dictionary mapping the names of nonterminals to a list of all -productions of that nonterminal. -
- -

-g.Terminals -

-A dictionary mapping the names of terminals to a list of the -production numbers where they are used. -
- -

-g.Nonterminals -

-A dictionary mapping the names of nonterminals to a list of the -production numbers where they are used. -
- -

-g.First -

-A dictionary representing the first sets for all grammar symbols. This is -computed and returned by the compute_first() method. -
- -

-g.Follow -

-A dictionary representing the follow sets for all grammar rules. This is -computed and returned by the compute_follow() method. -
- -

-g.Start -

-Starting symbol for the grammar. Set by the set_start() method. -
- -For the purposes of debugging, a Grammar object supports the __len__() and -__getitem__() special methods. Accessing g[n] returns the nth production -from the grammar. - - -

3. Productions

- - -Grammar objects store grammar rules as instances of a Production class. This -class has no public constructor--you should only create productions by calling Grammar.add_production(). -The following attributes are available on a Production instance p. - -

-p.name -

-The name of the production. For a grammar rule such as A : B C D, this is 'A'. -
- -

-p.prod -

-A tuple of symbols making up the right-hand side of the production. For a grammar rule such as A : B C D, this is ('B','C','D'). -
- -

-p.number -

-Production number. An integer containing the index of the production in the grammar's Productions list. -
- -

-p.func -

-The name of the reduction function associated with the production. -This is the function that will execute when reducing the entire -grammar rule during parsing. -
- -

-p.callable -

-The callable object associated with the name in p.func. This is None -unless the production has been bound using bind(). -
- -

-p.file -

-Filename associated with the production. Typically this is the file where the production was defined. Used for error messages. -
- -

-p.lineno -

-Line number associated with the production. Typically this is the line number in p.file where the production was defined. Used for error messages. -
- -

-p.prec -

-Precedence and associativity associated with the production. This is a tuple (assoc,level) where -assoc is one of 'left','right', or 'nonassoc' and level is -an integer. This value is determined by the precedence of the right-most terminal symbol in the production -or by use of the %prec specifier when adding the production. -
- -

-p.usyms -

-A list of all unique symbols found in the production. -
- -

-p.lr_items -

-A list of all LR items for this production. This attribute only has a meaningful value if the -Grammar.build_lritems() method has been called. The items in this list are -instances of LRItem described below. -
- -

-p.lr_next -

-The head of a linked-list representation of the LR items in p.lr_items. -This attribute only has a meaningful value if the Grammar.build_lritems() -method has been called. Each LRItem instance has a lr_next attribute -to move to the next item. The list is terminated by None. -
- -

-p.bind(dict) -

-Binds the production function name in p.func to a callable object in -dict. This operation is typically carried out in the last step -prior to running the parsing engine and is needed since parsing tables are typically -read from files which only include the function names, not the functions themselves. -
- -

-Production objects support -the __len__(), __getitem__(), and __str__() -special methods. -len(p) returns the number of symbols in p.prod -and p[n] is the same as p.prod[n]. - -
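A small hedged sketch of inspecting Production instances built through the same interface (the grammar and reduction-function names are made up for illustration):

from ply.yacc import Grammar

g = Grammar(['NUMBER', 'PLUS'])
g.add_production('expr', ['expr', 'PLUS', 'expr'], 'p_expr_plus', '<demo>', 1)
g.add_production('expr', ['NUMBER'], 'p_expr_num', '<demo>', 2)
g.set_start('expr')

p = g[1]                            # same as g.Productions[1]
print(p.name, p.prod, p.number)     # expr ('expr', 'PLUS', 'expr') 1
print(len(p), p[1])                 # 3 PLUS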

4. LRItems

-The construction of parsing tables in an LR-based parser generator is primarily done over a set of "LR Items". An LR item represents a stage of parsing one of the grammar rules. To compute the LR items, it is first necessary to call Grammar.build_lritems(). Once this step is complete, all of the productions in the grammar will have their LR items attached to them.

-Here is an interactive example that shows what LR items look like if you -interactively experiment. In this example, g is a Grammar -object. - -

-
->>> g.build_lritems()
->>> p = g[1]
->>> p
-Production(statement -> ID = expr)
->>>
-
-
- -In the above code, p represents the first grammar rule. In -this case, a rule 'statement -> ID = expr'. - -

-Now, let's look at the LR items for p. - -

-
->>> p.lr_items
-[LRItem(statement -> . ID = expr), 
- LRItem(statement -> ID . = expr), 
- LRItem(statement -> ID = . expr), 
- LRItem(statement -> ID = expr .)]
->>>
-
-
-In each LR item, the dot (.) represents a specific stage of parsing. From one item to the next, the dot is advanced by one symbol. It is only when the dot reaches the very end that a production is successfully parsed.

-An instance lr of LRItem has the following -attributes that hold information related to that specific stage of -parsing. - -

-lr.name -

-The name of the grammar rule. For example, 'statement' in the above example. -
- -

-lr.prod -

-A tuple of symbols representing the right-hand side of the production, including the -special '.' character. For example, ('ID','.','=','expr'). -
- -

-lr.number -

-An integer representing the production number in the grammar. -
- -

-lr.usyms -

-A set of unique symbols in the production. Inherited from the original Production instance. -
- -

-lr.lr_index -

-An integer representing the position of the dot (.). You should never use lr.prod.index() -to search for it--the result will be wrong if the grammar happens to also use (.) as a character -literal. -
- -

-lr.lr_after -

-A list of all productions that can legally appear immediately to the right of the -dot (.). This list contains Production instances. This attribute -represents all of the possible branches a parse can take from the current position. -For example, suppose that lr represents a stage immediately before -an expression like this: - -
->>> lr
-LRItem(statement -> ID = . expr)
->>>
-
- -Then, the value of lr.lr_after might look like this, showing all productions that -can legally appear next: - -
->>> lr.lr_after
-[Production(expr -> expr PLUS expr), 
- Production(expr -> expr MINUS expr), 
- Production(expr -> expr TIMES expr), 
- Production(expr -> expr DIVIDE expr), 
- Production(expr -> MINUS expr), 
- Production(expr -> LPAREN expr RPAREN), 
- Production(expr -> NUMBER), 
- Production(expr -> ID)]
->>>
-
- -
- -

-lr.lr_before -

-The grammar symbol that appears immediately before the dot (.) or None if -at the beginning of the parse. -
- -

-lr.lr_next -

-A link to the next LR item, representing the next stage of the parse. None if lr -is the last LR item. -
- -LRItem instances also support the __len__() and __getitem__() special methods. -len(lr) returns the number of items in lr.prod including the dot (.). lr[n] -returns lr.prod[n]. - -

-It goes without saying that all of the attributes associated with LR -items should be assumed to be read-only. Modifications will very -likely create a small black-hole that will consume you and your code. - -

5. LRTable

- - -The LRTable class is used to represent LR parsing table data. This -minimally includes the production list, action table, and goto table. - -

-LRTable() -

-Create an empty LRTable object. This object contains only the information needed to -run an LR parser. -
- -An instance lrtab of LRTable has the following methods: - -

-lrtab.read_table(module) -

-Populates the LR table with information from the module specified in module. -module is either a module object already loaded with import or -the name of a Python module. If it's a string containing a module name, it is -loaded and parsing data is extracted. Returns the signature value that was used -when initially writing the tables. Raises a VersionError exception if -the module was created using an incompatible version of PLY. -
- -

-lrtab.bind_callables(dict) -

-This binds all of the function names used in productions to callable objects -found in the dictionary dict. During table generation and when reading -LR tables from files, PLY only uses the names of action functions such as 'p_expr', -'p_statement', etc. In order to actually run the parser, these names -have to be bound to callable objects. This method is always called prior to -running a parser. -
- -After lrtab has been populated, the following attributes are defined. - -

-lrtab.lr_method -

-The LR parsing method used (e.g., 'LALR') -
- - -

-lrtab.lr_productions -

-The production list. If the parsing tables have been newly -constructed, this will be a list of Production instances. If -the parsing tables have been read from a file, it's a list -of MiniProduction instances. This, together -with lr_action and lr_goto contain all of the -information needed by the LR parsing engine. -
- -

-lrtab.lr_action -

-The LR action dictionary that implements the underlying state machine. -The keys of this dictionary are the LR states. -
- -

-lrtab.lr_goto -

-The LR goto table that contains information about grammar rule reductions. -
- - -

6. LRGeneratedTable

- - -The LRGeneratedTable class represents constructed LR parsing tables on a -grammar. It is a subclass of LRTable. - -

-LRGeneratedTable(grammar, method='LALR',log=None) -

-Create the LR parsing tables on a grammar. grammar is an instance of Grammar, -method is a string with the parsing method ('SLR' or 'LALR'), and -log is a logger object used to write debugging information. The debugging information -written to log is the same as what appears in the parser.out file created -by yacc. By supplying a custom logger with a different message format, it is possible to get -more information (e.g., the line number in yacc.py used for issuing each line of -output in the log). The result is an instance of LRGeneratedTable. -
- -

-An instance lr of LRGeneratedTable has the following attributes. - -

-lr.grammar -

-A link to the Grammar object used to construct the parsing tables. -
- -

-lr.lr_method -

-The LR parsing method used (e.g., 'LALR') -
- - -

-lr.lr_productions -

-A reference to grammar.Productions. This, together with lr_action and lr_goto -contain all of the information needed by the LR parsing engine. -
- -

-lr.lr_action -

-The LR action dictionary that implements the underlying state machine. The keys of this dictionary are -the LR states. -
- -

-lr.lr_goto -

-The LR goto table that contains information about grammar rule reductions. -
- -

-lr.sr_conflicts -

-A list of tuples (state,token,resolution) identifying all shift/reduce conflicts. state is the LR state -number where the conflict occurred, token is the token causing the conflict, and resolution is -a string describing the resolution taken. resolution is either 'shift' or 'reduce'. -
- -

-lr.rr_conflicts -

-A list of tuples (state,rule,rejected) identifying all reduce/reduce conflicts. state is the -LR state number where the conflict occurred, rule is the production rule that was selected -and rejected is the production rule that was rejected. Both rule and rejected are -instances of Production. They can be inspected to provide the user with more information. -
- -

-There are two public methods of LRGeneratedTable. - -

-lr.write_table(modulename,outputdir="",signature="") -

-Writes the LR parsing table information to a Python module. modulename is a string specifying the name of a module such as "parsetab". outputdir is the name of a directory where the module should be created. signature is a string representing a grammar signature that's written into the output file. This can be used to detect when the data stored in a module file is out-of-sync with the grammar specification (and that the tables need to be regenerated). If modulename is a string "parsetab", this function creates a file called parsetab.py. If the module name represents a package such as "foo.bar.parsetab", then only the last component, "parsetab", is used.
- - -

7. LRParser

- - -The LRParser class implements the low-level LR parsing engine. - - -

-LRParser(lrtab, error_func) -

-Create an LRParser. lrtab is an instance of LRTable -containing the LR production and state tables. error_func is the -error function to invoke in the event of a parsing error. -
- -An instance p of LRParser has the following methods: - -

-p.parse(input=None,lexer=None,debug=0,tracking=0,tokenfunc=None) -

-Run the parser. input is a string, which if supplied is fed into the -lexer using its input() method. lexer is an instance of the -Lexer class to use for tokenizing. If not supplied, the last lexer -created with the lex module is used. debug is a boolean flag -that enables debugging. tracking is a boolean flag that tells the -parser to perform additional line number tracking. tokenfunc is a callable -function that returns the next token. If supplied, the parser will use it to get -all tokens. -
- -

-p.restart() -

-Resets the parser state for a parse already in progress. -
- -

8. ParserReflect

- - -

-The ParserReflect class is used to collect parser specification data -from a Python module or object. This class is what collects all of the -p_rule() functions in a PLY file, performs basic error checking, -and collects all of the needed information to build a grammar. Most of the -high-level PLY interface as used by the yacc() function is actually -implemented by this class. - -

-ParserReflect(pdict, log=None) -

-Creates a ParserReflect instance. pdict is a dictionary -containing parser specification data. This dictionary typically corresponds -to the module or class dictionary of code that implements a PLY parser. -log is a logger instance that will be used to report error -messages. -
- -An instance p of ParserReflect has the following methods: - -

-p.get_all() -

-Collect and store all required parsing information. -
- -

-p.validate_all() -

-Validate all of the collected parsing information. This is a separate step from p.get_all() as a performance optimization. To reduce parser start-up time, a parser can elect to only validate the parsing data when regenerating the parsing tables. The validation step tries to collect as much information as possible rather than raising an exception at the first sign of trouble. The attribute p.error is set if there are any validation errors. The value of this attribute is also returned.
- -

-p.signature() -

-Compute a signature representing the contents of the collected parsing -data. The signature value should change if anything in the parser -specification has changed in a way that would justify parser table -regeneration. This method can be called after p.get_all(), -but before p.validate_all(). -
- -The following attributes are set in the process of collecting data: - -

-p.start -

-The grammar start symbol, if any. Taken from pdict['start']. -
- -

-p.error_func -

-The error handling function or None. Taken from pdict['p_error']. -
- -

-p.tokens -

-The token list. Taken from pdict['tokens']. -
- -

-p.prec -

-The precedence specifier. Taken from pdict['precedence']. -
- -

-p.preclist -

-A parsed version of the precedence specifier. A list of tuples of the form (token,assoc,level) where token is the terminal symbol, assoc is the associativity (e.g., 'left') and level is a numeric precedence level.
- -

-p.grammar -

-A list of tuples (name, rules) representing the grammar rules. name is the -name of a Python function or method in pdict that starts with "p_". -rules is a list of tuples (filename,line,prodname,syms) representing -the grammar rules found in the documentation string of that function. filename and line contain location -information that can be used for debugging. prodname is the name of the -production. syms is the right-hand side of the production. If you have a -function like this - -
-def p_expr(p):
-    '''expr : expr PLUS expr
-            | expr MINUS expr
-            | expr TIMES expr
-            | expr DIVIDE expr'''
-
- -then the corresponding entry in p.grammar might look like this: - -
-('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']),
-             ('calc.py',11,'expr', ['expr','MINUS','expr']),
-             ('calc.py',12,'expr', ['expr','TIMES','expr']),
-             ('calc.py',13,'expr', ['expr','DIVIDE','expr'])
-           ])
-
-
- -

-p.pfuncs -

-A sorted list of tuples (line, file, name, doc) representing all of -the p_ functions found. line and file give location -information. name is the name of the function. doc is the -documentation string. This list is sorted in ascending order by line number. -
- -

-p.files -

-A dictionary holding all of the source filenames that were encountered -while collecting parser information. Only the keys of this dictionary have -any meaning. -
- -

-p.error -

-An attribute that indicates whether or not any critical errors occurred in validation. If this is set, it means that some kind of problem was detected and that no further processing should be performed.
- - -

9. High-level operation

- - -Using all of the above classes requires some attention to detail. The yacc() -function carries out a very specific sequence of operations to create a grammar. -This same sequence should be emulated if you build an alternative PLY interface. - -
    -
  1. A ParserReflect object is created and raw grammar specification data is collected.
  2. A Grammar object is created and populated with information from the specification data.
  3. An LRGeneratedTable object is created to run the LALR algorithm over the Grammar object.
  4. Productions in the LRGeneratedTable are bound to callables using the bind_callables() method.
  5. An LRParser object is created from the information in the LRGeneratedTable object.
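What follows is a hedged, self-contained sketch of that sequence against the PLY 3.x-era API documented here. It builds a tiny addition parser without going through yacc.yacc(); the lexer half simply reuses the public ply.lex interface, and the productions are written out directly (rather than pulled from pinfo.grammar) to keep the sketch short. Exact attribute and constructor details may differ in other versions.

import ply.lex as lex
from ply.yacc import ParserReflect, Grammar, LRGeneratedTable, LRParser

# --- token rules (ordinary ply.lex specification) ---
tokens = ('NUMBER', 'PLUS')
t_PLUS   = r'\+'
t_ignore = ' \t'

def t_NUMBER(t):
    r'\d+'
    t.value = int(t.value)
    return t

def t_error(t):
    t.lexer.skip(1)

lexer = lex.lex()

# --- parser specification (what yacc() would normally discover) ---
precedence = (('left', 'PLUS'),)

def p_expr_plus(p):
    'expr : expr PLUS expr'
    p[0] = p[1] + p[3]

def p_expr_number(p):
    'expr : NUMBER'
    p[0] = p[1]

def p_error(tok):
    print(f'Syntax error at {tok!r}')

# Step 1: collect and validate the raw specification data
pinfo = ParserReflect(globals())
pinfo.get_all()
assert not pinfo.validate_all()

# Step 2: build and populate a Grammar object
grammar = Grammar(pinfo.tokens)
for term, assoc, level in pinfo.preclist:
    grammar.set_precedence(term, assoc, level)
# yacc() would take these rules from pinfo.grammar; written out here for brevity
grammar.add_production('expr', ['expr', 'PLUS', 'expr'], 'p_expr_plus', '<demo>', 1)
grammar.add_production('expr', ['NUMBER'], 'p_expr_number', '<demo>', 2)
grammar.set_start()
grammar.build_lritems()

# Step 3: run the LALR algorithm over the grammar
lr = LRGeneratedTable(grammar, 'LALR')

# Step 4: bind production function names to the actual callables
lr.bind_callables(globals())

# Step 5: create the parsing engine and run it
parser = LRParser(lr, pinfo.error_func)
print(parser.parse('1 + 2 + 3', lexer=lexer))    # -> 6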
- - - - - - - - - - diff --git a/doc/makedoc.py b/doc/makedoc.py deleted file mode 100644 index e5cbdb0..0000000 --- a/doc/makedoc.py +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/local/bin/python - -############################################################################### -# Takes a chapter as input and adds internal links and numbering to all -# of the H1, H2, H3, H4 and H5 sections. -# -# Every heading HTML tag (H1, H2 etc) is given an autogenerated name to link -# to. However, if the name is not an autogenerated name from a previous run, -# it will be kept. If it is autogenerated, it might change on subsequent runs -# of this program. Thus if you want to create links to one of the headings, -# then change the heading link name to something that does not look like an -# autogenerated link name. -############################################################################### - -import sys -import re -import string - -############################################################################### -# Functions -############################################################################### - -# Regexs for -alink = re.compile(r"", re.IGNORECASE) -heading = re.compile(r"(_nn\d)", re.IGNORECASE) - -def getheadingname(m): - autogeneratedheading = True - if m.group(1) != None: - amatch = alink.match(m.group(1)) - if amatch: - # A non-autogenerated heading - keep it - headingname = amatch.group(1) - autogeneratedheading = heading.match(headingname) - if autogeneratedheading: - # The heading name was either non-existent or autogenerated, - # We can create a new heading / change the existing heading - headingname = "%s_nn%d" % (filenamebase, nameindex) - return headingname - -############################################################################### -# Main program -############################################################################### - -if len(sys.argv) != 2: - print "usage: makedoc.py filename" - sys.exit(1) - -filename = sys.argv[1] -filenamebase = string.split(filename,".")[0] - -section = 0 -subsection = 0 -subsubsection = 0 -subsubsubsection = 0 -nameindex = 0 - -name = "" - -# Regexs for

,...

sections - -h1 = re.compile(r".*?

()*[\d\.\s]*(.*?)

", re.IGNORECASE) -h2 = re.compile(r".*?

()*[\d\.\s]*(.*?)

", re.IGNORECASE) -h3 = re.compile(r".*?

()*[\d\.\s]*(.*?)

", re.IGNORECASE) -h4 = re.compile(r".*?

()*[\d\.\s]*(.*?)

", re.IGNORECASE) -h5 = re.compile(r".*?
()*[\d\.\s]*(.*?)
", re.IGNORECASE) - -# Make backup -with open(filename) as src, open(filename+".bak","w") as dst: - dst.write(src.read()) - -lines = data.splitlines() -result = [ ] # This is the result of postprocessing the file -index = "\n
\n" # index contains the index for adding at the top of the file. Also printed to stdout. - -skip = 0 -skipspace = 0 - -for s in lines: - if s == "": - if not skip: - result.append("@INDEX@") - skip = 1 - else: - skip = 0 - continue - if skip: - continue - - if not s and skipspace: - continue - - if skipspace: - result.append("") - result.append("") - skipspace = 0 - - m = h2.match(s) - if m: - prevheadingtext = m.group(2) - nameindex += 1 - section += 1 - headingname = getheadingname(m) - result.append("""

%d. %s

""" % (headingname,section, prevheadingtext)) - - if subsubsubsection: - index += "\n" - if subsubsection: - index += "\n" - if subsection: - index += "\n" - if section == 1: - index += "\n" - if subsubsection: - index += "\n" - if subsection == 1: - index += "\n" - if subsubsection == 1: - index += "\n" - -if subsection: - index += "\n" - -if section: - index += "\n" - -index += "
\n\n" - -data = "\n".join(result) - -data = data.replace("@INDEX@",index) + "\n" - -# Write the file back out -with open(filename,"w") as f: - f.write(data) diff --git a/doc/ply.html b/doc/ply.html deleted file mode 100644 index 6b8aca9..0000000 --- a/doc/ply.html +++ /dev/null @@ -1,3496 +0,0 @@ - - -PLY (Python Lex-Yacc) - - - -

-PLY (Python Lex-Yacc)

-David M. Beazley
-dave@dabeaz.com

-PLY Version: 3.11

1. Preface and Requirements

- - -

-This document provides an overview of lexing and parsing with PLY. -Given the intrinsic complexity of parsing, I would strongly advise -that you read (or at least skim) this entire document before jumping -into a big development project with PLY. -

- -

-PLY-3.5 is compatible with both Python 2 and Python 3. If you are using -Python 2, you have to use Python 2.6 or newer. -

- -

2. Introduction

- - -PLY is a pure-Python implementation of the popular compiler -construction tools lex and yacc. The main goal of PLY is to stay -fairly faithful to the way in which traditional lex/yacc tools work. -This includes supporting LALR(1) parsing as well as providing -extensive input validation, error reporting, and diagnostics. Thus, -if you've used yacc in another programming language, it should be -relatively straightforward to use PLY. - -

-Early versions of PLY were developed to support an Introduction to -Compilers Course I taught in 2001 at the University of Chicago. -Since PLY was primarily developed as an instructional tool, you will -find it to be fairly picky about token and grammar rule -specification. In part, this -added formality is meant to catch common programming mistakes made by -novice users. However, advanced users will also find such features to -be useful when building complicated grammars for real programming -languages. It should also be noted that PLY does not provide much in -the way of bells and whistles (e.g., automatic construction of -abstract syntax trees, tree traversal, etc.). Nor would I consider it -to be a parsing framework. Instead, you will find a bare-bones, yet -fully capable lex/yacc implementation written entirely in Python. - -

-The rest of this document assumes that you are somewhat familiar with -parsing theory, syntax directed translation, and the use of compiler -construction tools such as lex and yacc in other programming -languages. If you are unfamiliar with these topics, you will probably -want to consult an introductory text such as "Compilers: Principles, -Techniques, and Tools", by Aho, Sethi, and Ullman. O'Reilly's "Lex -and Yacc" by John Levine may also be handy. In fact, the O'Reilly book can be -used as a reference for PLY as the concepts are virtually identical. - -

3. PLY Overview

- - -

-PLY consists of two separate modules: lex.py and yacc.py, both of which are found in a Python package called ply. The lex.py module is used to break input text into a collection of tokens specified by a collection of regular expression rules. yacc.py is used to recognize language syntax that has been specified in the form of a context-free grammar.

- -

-The two tools are meant to work together. Specifically, -lex.py provides an external interface in the form of a -token() function that returns the next valid token on the -input stream. yacc.py calls this repeatedly to retrieve -tokens and invoke grammar rules. The output of yacc.py is -often an Abstract Syntax Tree (AST). However, this is entirely up to -the user. If desired, yacc.py can also be used to implement -simple one-pass compilers. - -

-Like its Unix counterpart, yacc.py provides most of the -features you expect including extensive error checking, grammar -validation, support for empty productions, error tokens, and ambiguity -resolution via precedence rules. In fact, almost everything that is possible in traditional yacc -should be supported in PLY. - -

-The primary difference between -yacc.py and Unix yacc is that yacc.py -doesn't involve a separate code-generation process. -Instead, PLY relies on reflection (introspection) -to build its lexers and parsers. Unlike traditional lex/yacc which -require a special input file that is converted into a separate source -file, the specifications given to PLY are valid Python -programs. This means that there are no extra source files nor is -there a special compiler construction step (e.g., running yacc to -generate Python code for the compiler). Since the generation of the -parsing tables is relatively expensive, PLY caches the results and -saves them to a file. If no changes are detected in the input source, -the tables are read from the cache. Otherwise, they are regenerated. - -

4. Lex

- - -lex.py is used to tokenize an input string. For example, suppose -you're writing a programming language and a user supplied the following input string: - -
-
-x = 3 + 42 * (s - t)
-
-
- -A tokenizer splits the string into individual tokens - -
-
-'x','=', '3', '+', '42', '*', '(', 's', '-', 't', ')'
-
-
- -Tokens are usually given names to indicate what they are. For example: - -
-
-'ID','EQUALS','NUMBER','PLUS','NUMBER','TIMES',
-'LPAREN','ID','MINUS','ID','RPAREN'
-
-
- -More specifically, the input is broken into pairs of token types and values. For example: - -
-
-('ID','x'), ('EQUALS','='), ('NUMBER','3'), 
-('PLUS','+'), ('NUMBER','42'), ('TIMES','*'),
-('LPAREN','('), ('ID','s'), ('MINUS','-'),
-('ID','t'), ('RPAREN',')')
-
-
- -The identification of tokens is typically done by writing a series of regular expression -rules. The next section shows how this is done using lex.py. - -

4.1 Lex Example

- - -The following example shows how lex.py is used to write a simple tokenizer. - -
-
-# ------------------------------------------------------------
-# calclex.py
-#
-# tokenizer for a simple expression evaluator for
-# numbers and +,-,*,/
-# ------------------------------------------------------------
-import ply.lex as lex
-
-# List of token names.   This is always required
-tokens = (
-   'NUMBER',
-   'PLUS',
-   'MINUS',
-   'TIMES',
-   'DIVIDE',
-   'LPAREN',
-   'RPAREN',
-)
-
-# Regular expression rules for simple tokens
-t_PLUS    = r'\+'
-t_MINUS   = r'-'
-t_TIMES   = r'\*'
-t_DIVIDE  = r'/'
-t_LPAREN  = r'\('
-t_RPAREN  = r'\)'
-
-# A regular expression rule with some action code
-def t_NUMBER(t):
-    r'\d+'
-    t.value = int(t.value)    
-    return t
-
-# Define a rule so we can track line numbers
-def t_newline(t):
-    r'\n+'
-    t.lexer.lineno += len(t.value)
-
-# A string containing ignored characters (spaces and tabs)
-t_ignore  = ' \t'
-
-# Error handling rule
-def t_error(t):
-    print("Illegal character '%s'" % t.value[0])
-    t.lexer.skip(1)
-
-# Build the lexer
-lexer = lex.lex()
-
-
-
-To use the lexer, you first need to feed it some input text using -its input() method. After that, repeated calls -to token() produce tokens. The following code shows how this -works: - -
-
-
-# Test it out
-data = '''
-3 + 4 * 10
-  + -20 *2
-'''
-
-# Give the lexer some input
-lexer.input(data)
-
-# Tokenize
-while True:
-    tok = lexer.token()
-    if not tok: 
-        break      # No more input
-    print(tok)
-
-
- -When executed, the example will produce the following output: - -
-
-$ python example.py
-LexToken(NUMBER,3,2,1)
-LexToken(PLUS,'+',2,3)
-LexToken(NUMBER,4,2,5)
-LexToken(TIMES,'*',2,7)
-LexToken(NUMBER,10,2,10)
-LexToken(PLUS,'+',3,14)
-LexToken(MINUS,'-',3,16)
-LexToken(NUMBER,20,3,18)
-LexToken(TIMES,'*',3,20)
-LexToken(NUMBER,2,3,21)
-
-
- -Lexers also support the iteration protocol. So, you can write the above loop as follows: - -
-
-for tok in lexer:
-    print(tok)
-
-
- -The tokens returned by lexer.token() are instances -of LexToken. This object has -attributes tok.type, tok.value, -tok.lineno, and tok.lexpos. The following code shows an example of -accessing these attributes: - -
-
-# Tokenize
-while True:
-    tok = lexer.token()
-    if not tok: 
-        break      # No more input
-    print(tok.type, tok.value, tok.lineno, tok.lexpos)
-
-
- -The tok.type and tok.value attributes contain the -type and value of the token itself. -tok.lineno and tok.lexpos contain information about -the location of the token. tok.lexpos is the index of the -token relative to the start of the input text. - -

4.2 The tokens list

- - -

-All lexers must provide a list tokens that defines all of the possible token -names that can be produced by the lexer. This list is always required -and is used to perform a variety of validation checks. The tokens list is also used by the -yacc.py module to identify terminals. -

- -

-In the example, the following code specified the token names: - -

-
-tokens = (
-   'NUMBER',
-   'PLUS',
-   'MINUS',
-   'TIMES',
-   'DIVIDE',
-   'LPAREN',
-   'RPAREN',
-)
-
-
- -

4.3 Specification of tokens

-Each token is specified by writing a regular expression rule compatible with Python's re module. Each of these rules is defined by making a declaration with a special prefix t_ to indicate that it defines a token. For simple tokens, the regular expression can be specified as a string such as this (note: Python raw strings are used since they are the most convenient way to write regular expression strings):
-
-t_PLUS = r'\+'
-
-
- -In this case, the name following the t_ must exactly match one of the -names supplied in tokens. If some kind of action needs to be performed, -a token rule can be specified as a function. For example, this rule matches numbers and -converts the string into a Python integer. - -
-
-def t_NUMBER(t):
-    r'\d+'
-    t.value = int(t.value)
-    return t
-
-
- -When a function is used, the regular expression rule is specified in the function documentation string. -The function always takes a single argument which is an instance of -LexToken. This object has attributes of t.type which is the token type (as a string), -t.value which is the lexeme (the actual text matched), t.lineno which is the current line number, and t.lexpos which -is the position of the token relative to the beginning of the input text. -By default, t.type is set to the name following the t_ prefix. The action -function can modify the contents of the LexToken object as appropriate. However, -when it is done, the resulting token should be returned. If no value is returned by the action -function, the token is simply discarded and the next token read. - -

-Internally, lex.py uses the re module to do its pattern matching. Patterns are compiled -using the re.VERBOSE flag which can be used to help readability. However, be aware that unescaped -whitespace is ignored and comments are allowed in this mode. If your pattern involves whitespace, make sure you -use \s. If you need to match the # character, use [#]. -
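A small self-contained illustration of this caveat; the token names here are made up, but the point is that the literal space and '#' are written as character classes so that re.VERBOSE does not swallow them.

import ply.lex as lex

tokens = ('DIRECTIVE', 'NAME')
t_DIRECTIVE = r'[#][ ]?include'      # matches "#include" or "# include"
t_NAME = r'[A-Za-z_][A-Za-z0-9_]*'
t_ignore = ' \t'

def t_error(t):
    t.lexer.skip(1)

lexer = lex.lex()
lexer.input('# include stdio')
print([tok.type for tok in lexer])   # ['DIRECTIVE', 'NAME']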

- -

-When building the master regular expression, -rules are added in the following order: -

- -

-

    -
  1. All tokens defined by functions are added in the same order as they appear in the lexer file. -
  2. Tokens defined by strings are added next by sorting them in order of decreasing regular expression length (longer expressions -are added first). -
-

-Without this ordering, it can be difficult to correctly match certain types of tokens. For example, if you -wanted to have separate tokens for "=" and "==", you need to make sure that "==" is checked first. By sorting regular -expressions in order of decreasing length, this problem is solved for rules defined as strings. For functions, -the order can be explicitly controlled since rules appearing first are checked first. - -

-To handle reserved words, you should write a single rule to match an -identifier and do a special name lookup in a function like this: - -

-
-reserved = {
-   'if' : 'IF',
-   'then' : 'THEN',
-   'else' : 'ELSE',
-   'while' : 'WHILE',
-   ...
-}
-
-tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values())
-
-def t_ID(t):
-    r'[a-zA-Z_][a-zA-Z_0-9]*'
-    t.type = reserved.get(t.value,'ID')    # Check for reserved words
-    return t
-
-
- -This approach greatly reduces the number of regular expression rules and is likely to make things a little faster. - -

-Note: You should avoid writing individual rules for reserved words. For example, if you write rules like this, - -

-
-t_FOR   = r'for'
-t_PRINT = r'print'
-
-
- -those rules will be triggered for identifiers that include those words as a prefix such as "forget" or "printed". This is probably not -what you want. - -

4.4 Token values

- - -When tokens are returned by lex, they have a value that is stored in the value attribute. Normally, the value is the text -that was matched. However, the value can be assigned to any Python object. For instance, when lexing identifiers, you may -want to return both the identifier name and information from some sort of symbol table. To do this, you might write a rule like this: - -
-
-def t_ID(t):
-    ...
-    # Look up symbol table information and return a tuple
-    t.value = (t.value, symbol_lookup(t.value))
-    ...
-    return t
-
-
- -It is important to note that storing data in other attribute names is not recommended. The yacc.py module only exposes the -contents of the value attribute. Thus, accessing other attributes may be unnecessarily awkward. If you -need to store multiple values on a token, assign a tuple, dictionary, or instance to value. - -
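A hedged, self-contained variant of that idea, using a plain dictionary as a stand-in for a symbol table:

import ply.lex as lex

tokens = ('ID',)
t_ignore = ' \t'
symtab = {'x': 'int', 'y': 'float'}   # illustrative "symbol table"

def t_ID(t):
    r'[A-Za-z_][A-Za-z0-9_]*'
    t.value = (t.value, symtab.get(t.value))   # attach extra data as a tuple
    return t

def t_error(t):
    t.lexer.skip(1)

lexer = lex.lex()
lexer.input('x y z')
print([tok.value for tok in lexer])
# [('x', 'int'), ('y', 'float'), ('z', None)]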

4.5 Discarded tokens

- - -To discard a token, such as a comment, simply define a token rule that returns no value. For example: - -
-
-def t_COMMENT(t):
-    r'\#.*'
-    pass
-    # No return value. Token discarded
-
-
- -Alternatively, you can include the prefix "ignore_" in the token declaration to force a token to be ignored. For example: - -
-
-t_ignore_COMMENT = r'\#.*'
-
-
- -Be advised that if you are ignoring many different kinds of text, you may still want to use functions since these provide more precise -control over the order in which regular expressions are matched (i.e., functions are matched in order of specification whereas strings are -sorted by regular expression length). - -

4.6 Line numbers and positional information

- - -

By default, lex.py knows nothing about line numbers. This is because lex.py doesn't know anything -about what constitutes a "line" of input (e.g., the newline character or even if the input is textual data). -To update this information, you need to write a special rule. In the example, the t_newline() rule shows how to do this. - -

-
-# Define a rule so we can track line numbers
-def t_newline(t):
-    r'\n+'
-    t.lexer.lineno += len(t.value)
-
-
-Within the rule, the lineno attribute of the underlying lexer t.lexer is updated. -After the line number is updated, the token is simply discarded since nothing is returned. - -

-lex.py does not perform any kind of automatic column tracking. However, it does record positional -information related to each token in the lexpos attribute. Using this, it is usually possible to compute -column information as a separate step. For instance, just count backwards until you reach a newline. - -

-
-# Compute column.
-#     input is the input text string
-#     token is a token instance
-def find_column(input, token):
-    line_start = input.rfind('\n', 0, token.lexpos) + 1
-    return (token.lexpos - line_start) + 1
-
-
- -Since column information is often only useful in the context of error handling, calculating the column -position can be performed when needed as opposed to doing it for each token. - -
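For example, an error handler might combine find_column() with the lexer's stored input (available as t.lexer.lexdata) along these lines; the exact reporting format is, of course, up to you.

def find_column(input, token):
    line_start = input.rfind('\n', 0, token.lexpos) + 1
    return (token.lexpos - line_start) + 1

def t_error(t):
    col = find_column(t.lexer.lexdata, t)
    print(f"Illegal character {t.value[0]!r} at line {t.lineno}, column {col}")
    t.lexer.skip(1)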

4.7 Ignored characters

- - -

-The special t_ignore rule is reserved by lex.py for characters -that should be completely ignored in the input stream. -Usually this is used to skip over whitespace and other non-essential characters. -Although it is possible to define a regular expression rule for whitespace in a manner -similar to t_newline(), the use of t_ignore provides substantially better -lexing performance because it is handled as a special case and is checked in a much -more efficient manner than the normal regular expression rules. -

- -

-The characters given in t_ignore are not ignored when such characters are part of -other regular expression patterns. For example, if you had a rule to capture quoted text, -that pattern can include the ignored characters (which will be captured in the normal way). The -main purpose of t_ignore is to ignore whitespace and other padding between the -tokens that you actually want to parse. -

- -

4.8 Literal characters

- - -

-Literal characters can be specified by defining a variable literals in your lexing module. For example: - -

-
-literals = [ '+','-','*','/' ]
-
-
- -or alternatively - -
-
-literals = "+-*/"
-
-
- -A literal character is simply a single character that is returned "as is" when encountered by the lexer. Literals are checked -after all of the defined regular expression rules. Thus, if a rule starts with one of the literal characters, it will always -take precedence. - -

-When a literal token is returned, both its type and value attributes are set to the character itself. For example, '+'. -

- -

-It's possible to write token functions that perform additional actions -when literals are matched. However, you'll need to set the token type -appropriately. For example: -

- -
-
-literals = [ '{', '}' ]
-
-def t_lbrace(t):
-    r'\{'
-    t.type = '{'      # Set token type to the expected literal
-    return t
-
-def t_rbrace(t):
-    r'\}'
-    t.type = '}'      # Set token type to the expected literal
-    return t
-
-
- -

4.9 Error handling

- - -

-The t_error() -function is used to handle lexing errors that occur when illegal -characters are detected. In this case, the t.value attribute contains the -rest of the input string that has not been tokenized. In the example, the error function -was defined as follows: - -

-
-# Error handling rule
-def t_error(t):
-    print("Illegal character '%s'" % t.value[0])
-    t.lexer.skip(1)
-
-
- -In this case, we simply print the offending character and skip ahead one character by calling t.lexer.skip(1). - -

4.10 EOF Handling

- - -

-The t_eof() function is used to handle an end-of-file (EOF) condition in the input. As input, it receives a token of type 'eof' with the lineno and lexpos attributes set appropriately. The main use of this function is to provide more input to the lexer so that it can continue to parse. Here is an example of how this works:

- -
-
-# EOF handling rule
-def t_eof(t):
-    # Get more input (Example)
-    more = input('... ')
-    if more:
-        t.lexer.input(more)
-        return t.lexer.token()
-    return None
-
-
- -

-The EOF function should return the next available token (by calling t.lexer.token()) or None to indicate no more data. Be aware that setting more input with the t.lexer.input() method does NOT reset the lexer state or the lineno attribute used for position tracking. The lexpos attribute is reset so be aware of that if you're using it in error reporting.

- -

4.11 Building and using the lexer

- - -

-To build the lexer, the function lex.lex() is used. For example:

- -
-
-lexer = lex.lex()
-
-
- -

This function uses Python reflection (or introspection) to read the regular expression rules out of the calling context and build the lexer. Once the lexer has been built, two methods can be used to control it: lexer.input(data), which stores a new input string to tokenize, and lexer.token(), which returns the next token or None when the input is exhausted.

- - -

4.12 The @TOKEN decorator

-In some applications, you may want to define tokens as a series of more complex regular expression rules. For example:
-
-digit            = r'([0-9])'
-nondigit         = r'([_A-Za-z])'
-identifier       = r'(' + nondigit + r'(' + digit + r'|' + nondigit + r')*)'        
-
-def t_ID(t):
-    # want docstring to be identifier above. ?????
-    ...
-
-
- -In this case, we want the regular expression rule for ID to be one of the variables above. However, there is no -way to directly specify this using a normal documentation string. To solve this problem, you can use the @TOKEN -decorator. For example: - -
-
-from ply.lex import TOKEN
-
-@TOKEN(identifier)
-def t_ID(t):
-    ...
-
-
- -

-This will attach identifier to the docstring for t_ID() allowing lex.py to work normally. -

- -

4.13 Optimized mode

- - -For improved performance, it may be desirable to use Python's -optimized mode (e.g., running Python with the -O -option). However, doing so causes Python to ignore documentation -strings. This presents special problems for lex.py. To -handle this case, you can create your lexer using -the optimize option as follows: - -
-
-lexer = lex.lex(optimize=1)
-
-
- -Next, run Python in its normal operating mode. When you do -this, lex.py will write a file called lextab.py in -the same directory as the module containing the lexer specification. -This file contains all of the regular -expression rules and tables used during lexing. On subsequent -executions, -lextab.py will simply be imported to build the lexer. This -approach substantially improves the startup time of the lexer and it -works in Python's optimized mode. - -

To change the name of the lexer-generated module, use the lextab keyword argument. For example:
-
-lexer = lex.lex(optimize=1,lextab="footab")
-
-
When running in optimized mode, it is important to note that lex disables most error checking. Thus, this is really only recommended if you're sure everything is working correctly and you're ready to start releasing production code.

4.14 Debugging

For the purpose of debugging, you can run lex() in a debugging mode as follows:
-
-lexer = lex.lex(debug=1)
-
-
This will produce various sorts of debugging information including all of the added rules, the master regular expressions used by the lexer, and the tokens generated during lexing.

In addition, lex.py comes with a simple main function that will tokenize input read either from standard input or from a file specified on the command line. To use it, simply put this in your lexer:
-
-if __name__ == '__main__':
-     lex.runmain()
-
-
Please refer to the "Debugging" section near the end for some more advanced details of debugging.

4.15 Alternative specification of lexers

As shown in the example, lexers are specified all within one Python module. If you want to put token rules in a different module from the one in which you invoke lex(), use the module keyword argument.

For example, you might have a dedicated module that just contains the token rules:

-
-# module: tokrules.py
-# This module just contains the lexing rules
-
-# List of token names.   This is always required
-tokens = (
-   'NUMBER',
-   'PLUS',
-   'MINUS',
-   'TIMES',
-   'DIVIDE',
-   'LPAREN',
-   'RPAREN',
-)
-
-# Regular expression rules for simple tokens
-t_PLUS    = r'\+'
-t_MINUS   = r'-'
-t_TIMES   = r'\*'
-t_DIVIDE  = r'/'
-t_LPAREN  = r'\('
-t_RPAREN  = r'\)'
-
-# A regular expression rule with some action code
-def t_NUMBER(t):
-    r'\d+'
-    t.value = int(t.value)    
-    return t
-
-# Define a rule so we can track line numbers
-def t_newline(t):
-    r'\n+'
-    t.lexer.lineno += len(t.value)
-
-# A string containing ignored characters (spaces and tabs)
-t_ignore  = ' \t'
-
-# Error handling rule
-def t_error(t):
-    print("Illegal character '%s'" % t.value[0])
-    t.lexer.skip(1)
-
-
Now, if you wanted to build a tokenizer from these rules from within a different module, you would do the following (shown for Python interactive mode):
-
->>> import tokrules
->>> lexer = lex.lex(module=tokrules)
->>> lexer.input("3 + 4")
->>> lexer.token()
-LexToken(NUMBER,3,1,0)
->>> lexer.token()
-LexToken(PLUS,'+',1,2)
->>> lexer.token()
-LexToken(NUMBER,4,1,4)
->>> lexer.token()
-None
->>>
-
-
The module option can also be used to define lexers from instances of a class. For example:
-
-import ply.lex as lex
-
-class MyLexer(object):
-    # List of token names.   This is always required
-    tokens = (
-       'NUMBER',
-       'PLUS',
-       'MINUS',
-       'TIMES',
-       'DIVIDE',
-       'LPAREN',
-       'RPAREN',
-    )
-
-    # Regular expression rules for simple tokens
-    t_PLUS    = r'\+'
-    t_MINUS   = r'-'
-    t_TIMES   = r'\*'
-    t_DIVIDE  = r'/'
-    t_LPAREN  = r'\('
-    t_RPAREN  = r'\)'
-
-    # A regular expression rule with some action code
-    # Note addition of self parameter since we're in a class
-    def t_NUMBER(self,t):
-        r'\d+'
-        t.value = int(t.value)    
-        return t
-
-    # Define a rule so we can track line numbers
-    def t_newline(self,t):
-        r'\n+'
-        t.lexer.lineno += len(t.value)
-
-    # A string containing ignored characters (spaces and tabs)
-    t_ignore  = ' \t'
-
-    # Error handling rule
-    def t_error(self,t):
-        print("Illegal character '%s'" % t.value[0])
-        t.lexer.skip(1)
-
-    # Build the lexer
-    def build(self,**kwargs):
-        self.lexer = lex.lex(module=self, **kwargs)
-    
-    # Test its output
-    def test(self,data):
-        self.lexer.input(data)
-        while True:
-             tok = self.lexer.token()
-             if not tok: 
-                 break
-             print(tok)
-
-# Build the lexer and try it out
-m = MyLexer()
-m.build()           # Build the lexer
-m.test("3 + 4")     # Test it
-
-
When building a lexer from a class, you should construct the lexer from an instance of the class, not the class object itself. This is because PLY only works properly if the lexer actions are defined by bound methods.

When using the module option to lex(), PLY collects symbols from the underlying object using the dir() function. There is no direct access to the __dict__ attribute of the object supplied as a module value.

Finally, if you want to keep things nicely encapsulated, but don't want to use a full-fledged class definition, lexers can be defined using closures. For example:

-
-import ply.lex as lex
-
-# List of token names.   This is always required
-tokens = (
-  'NUMBER',
-  'PLUS',
-  'MINUS',
-  'TIMES',
-  'DIVIDE',
-  'LPAREN',
-  'RPAREN',
-)
-
-def MyLexer():
-    # Regular expression rules for simple tokens
-    t_PLUS    = r'\+'
-    t_MINUS   = r'-'
-    t_TIMES   = r'\*'
-    t_DIVIDE  = r'/'
-    t_LPAREN  = r'\('
-    t_RPAREN  = r'\)'
-
-    # A regular expression rule with some action code
-    def t_NUMBER(t):
-        r'\d+'
-        t.value = int(t.value)    
-        return t
-
-    # Define a rule so we can track line numbers
-    def t_newline(t):
-        r'\n+'
-        t.lexer.lineno += len(t.value)
-
-    # A string containing ignored characters (spaces and tabs)
-    t_ignore  = ' \t'
-
-    # Error handling rule
-    def t_error(t):
-        print("Illegal character '%s'" % t.value[0])
-        t.lexer.skip(1)
-
-    # Build the lexer from my environment and return it    
-    return lex.lex()
-
-
Important note: If you are defining a lexer using a class or closure, be aware that PLY still requires you to only define a single lexer per module (source file). There are extensive validation/error checking parts of PLY that may falsely report error messages if you don't follow this rule.

4.16 Maintaining state

In your lexer, you may want to maintain a variety of state information. This might include mode settings, symbol tables, and other details. As an example, suppose that you wanted to keep track of how many NUMBER tokens had been encountered.

One way to do this is to keep a set of global variables in the module where you created the lexer. For example:

-
-num_count = 0
-def t_NUMBER(t):
-    r'\d+'
-    global num_count
-    num_count += 1
-    t.value = int(t.value)    
-    return t
-
-
If you don't like the use of a global variable, another place to store information is inside the Lexer object created by lex(). To do this, you can use the lexer attribute of tokens passed to the various rules. For example:
-
-def t_NUMBER(t):
-    r'\d+'
-    t.lexer.num_count += 1     # Note use of lexer attribute
-    t.value = int(t.value)    
-    return t
-
-lexer = lex.lex()
-lexer.num_count = 0            # Set the initial count
-
-
This latter approach has the advantage of being simple and working correctly in applications where multiple instantiations of a given lexer exist in the same application. However, this might also feel like a gross violation of encapsulation to OO purists. Just to put your mind at ease, all internal attributes of the lexer (with the exception of lineno) have names that are prefixed by lex (e.g., lexdata, lexpos, etc.). Thus, it is perfectly safe to store attributes in the lexer that don't have names starting with that prefix or a name that conflicts with one of the predefined methods (e.g., input(), token(), etc.).
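As a small illustration of that naming rule (num_count and symbols are just illustrative attribute names):

-
-lexer = lex.lex()
-lexer.num_count = 0      # fine: doesn't start with "lex"
-lexer.symbols = {}       # fine: an illustrative place to hang a symbol table
-
-# Avoid names such as lexer.lexdata or lexer.lexpos (internal attributes),
-# and don't shadow predefined methods such as input() or token().
-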

If you don't like assigning values on the lexer object, you can define your lexer as a class as shown in the previous section:

-
-class MyLexer:
-    ...
-    def t_NUMBER(self,t):
-        r'\d+'
-        self.num_count += 1
-        t.value = int(t.value)    
-        return t
-
-    def build(self, **kwargs):
-        self.lexer = lex.lex(object=self,**kwargs)
-
-    def __init__(self):
-        self.num_count = 0
-
-
The class approach may be the easiest to manage if your application is going to be creating multiple instances of the same lexer and you need to manage a lot of state.

State can also be managed through closures, using nonlocal. For example:

-
-def MyLexer():
-    num_count = 0
-    ...
-    def t_NUMBER(t):
-        r'\d+'
-        nonlocal num_count
-        num_count += 1
-        t.value = int(t.value)    
-        return t
-    ...
-
-

4.17 Lexer cloning

If necessary, a lexer object can be duplicated by invoking its clone() method. For example:

-
-lexer = lex.lex()
-...
-newlexer = lexer.clone()
-
-
When a lexer is cloned, the copy is exactly identical to the original lexer, including any input text and internal state. However, the clone allows a different set of input text to be supplied, which may be processed separately. This may be useful in situations when you are writing a parser/compiler that involves recursive or reentrant processing. For instance, if you needed to scan ahead in the input for some reason, you could create a clone and use it to look ahead. Or, if you were implementing some kind of preprocessor, cloned lexers could be used to handle different input files.

Creating a clone is different from calling lex.lex() in that PLY doesn't regenerate any of the internal tables or regular expressions.

Special considerations need to be made when cloning lexers that also maintain their own internal state using classes or closures. Namely, you need to be aware that the newly created lexers will share all of this state with the original lexer. For example, if you defined a lexer as a class and did this:

-
-m = MyLexer()
-a = lex.lex(object=m)      # Create a lexer
-
-b = a.clone()              # Clone the lexer
-
-
Then both a and b are going to be bound to the same object m, and any changes to m will be reflected in both lexers. It's important to emphasize that clone() is only meant to create a new lexer that reuses the regular expressions and environment of another lexer. If you need to make a totally new copy of a lexer, then call lex() again.
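To make that distinction concrete, here is a sketch contrasting a clone (which shares the instance m) with a genuinely independent lexer obtained by calling lex() again on a second instance:

-
-m = MyLexer()
-a = lex.lex(object=m)      # lexer driven by the instance m
-b = a.clone()              # shares m: state changes through a or b affect both
-
-m2 = MyLexer()             # a second, separate instance
-c = lex.lex(object=m2)     # a fully independent lexer with its own state
-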

4.18 Internal lexer state

A Lexer object lexer has a number of internal attributes that may be useful in certain situations.

-lexer.lexpos -

This attribute is an integer that contains the current position within the input text. If you modify the value, it will change the result of the next call to token(). Within token rule functions, this points to the first character after the matched text. If the value is modified within a rule, the next returned token will be matched at the new position.

-lexer.lineno -

The current value of the line number attribute stored in the lexer. PLY only specifies that the attribute exists; it never sets, updates, or performs any processing with it. If you want to track line numbers, you will need to add code yourself (see the section on line numbers and positional information).

-lexer.lexdata -

The current input text stored in the lexer. This is the string passed with the input() method. It would probably be a bad idea to modify this unless you really know what you're doing.

-lexer.lexmatch -

This is the raw Match object returned by the Python re.match() function (used internally by PLY) for the current token, as sketched below. If you have written a regular expression that contains named groups, you can use this to retrieve those values. Note: This attribute is only updated when tokens are defined and processed by functions.
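As a sketch of the named-group case (the SECTION token and its pattern are made up for illustration), a function rule can pull the captured group back out of t.lexer.lexmatch:

-
-tokens = ('SECTION',)      # hypothetical token for this sketch
-
-def t_SECTION(t):
-    r'\[(?P<name>[a-zA-Z_][a-zA-Z0-9_]*)\]'
-    # lexmatch is the re match object for this token; fetch the named group
-    t.value = t.lexer.lexmatch.group('name')
-    return t
-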

4.19 Conditional lexing and start conditions

In advanced parsing applications, it may be useful to have different lexing states. For instance, you may want the occurrence of a certain token or syntactic construct to trigger a different kind of lexing. PLY supports a feature that allows the underlying lexer to be put into a series of different states. Each state can have its own tokens, lexing rules, and so forth. The implementation is based largely on the "start condition" feature of GNU flex. Details of this can be found at http://flex.sourceforge.net/manual/Start-Conditions.html.

To define a new lexing state, it must first be declared. This is done by including a "states" declaration in your lex file. For example:

-
-states = (
-   ('foo','exclusive'),
-   ('bar','inclusive'),
-)
-
-
This declares two states, 'foo' and 'bar'. States may be of two types: 'exclusive' and 'inclusive'. An exclusive state completely overrides the default behavior of the lexer. That is, lex will only return tokens and apply rules defined specifically for that state. An inclusive state adds additional tokens and rules to the default set of rules. Thus, lex will return both the tokens defined by default and those defined for the inclusive state.

Once a state has been declared, tokens and rules are declared by including the state name in the token/rule declaration. For example:

-
-t_foo_NUMBER = r'\d+'                      # Token 'NUMBER' in state 'foo'        
-t_bar_ID     = r'[a-zA-Z_][a-zA-Z0-9_]*'   # Token 'ID' in state 'bar'
-
-def t_foo_newline(t):
-    r'\n'
-    t.lexer.lineno += 1
-
-
A token can be declared in multiple states by including multiple state names in the declaration. For example:
-
-t_foo_bar_NUMBER = r'\d+'         # Defines token 'NUMBER' in both state 'foo' and 'bar'
-
-
Alternatively, a token can be declared in all states by using 'ANY' in the name.
-
-t_ANY_NUMBER = r'\d+'         # Defines a token 'NUMBER' in all states
-
-
If no state name is supplied, as is normally the case, the token is associated with a special state 'INITIAL'. For example, these two declarations are identical:
-
-t_NUMBER = r'\d+'
-t_INITIAL_NUMBER = r'\d+'
-
-
States are also associated with the special t_ignore, t_error(), and t_eof() declarations. For example, if a state treats these differently, you can declare:

-
-t_foo_ignore = " \t\n"       # Ignored characters for state 'foo'
-
-def t_bar_error(t):          # Special error handler for state 'bar'
-    pass 
-
-
By default, lexing operates in the 'INITIAL' state. This state includes all of the normally defined tokens. For users who aren't using different states, this fact is completely transparent. If, during lexing or parsing, you want to change the lexing state, use the begin() method. For example:
-
-def t_begin_foo(t):
-    r'start_foo'
-    t.lexer.begin('foo')             # Starts 'foo' state
-
-
To get out of a state, you use begin() to switch back to the initial state. For example:
-
-def t_foo_end(t):
-    r'end_foo'
-    t.lexer.begin('INITIAL')        # Back to the initial state
-
-
The management of states can also be done with a stack. For example:
-
-def t_begin_foo(t):
-    r'start_foo'
-    t.lexer.push_state('foo')             # Starts 'foo' state
-
-def t_foo_end(t):
-    r'end_foo'
-    t.lexer.pop_state()                   # Back to the previous state
-
-
The use of a stack would be useful in situations where there are many ways of entering a new lexing state and you merely want to go back to the previous state afterwards.

An example might help clarify. Suppose you were writing a parser and you wanted to grab sections of arbitrary C code enclosed by curly braces. That is, whenever you encounter a starting brace '{', you want to read all of the enclosed code up to the ending brace '}' and return it as a string. Doing this with a normal regular expression rule is nearly (if not actually) impossible. This is because braces can be nested and can be included in comments and strings. Thus, simply matching up to the first matching '}' character isn't good enough. Here is how you might use lexer states to do this:

-
-# Declare the state
-states = (
-  ('ccode','exclusive'),
-)
-
-# Match the first {. Enter ccode state.
-def t_ccode(t):
-    r'\{'
-    t.lexer.code_start = t.lexer.lexpos        # Record the starting position
-    t.lexer.level = 1                          # Initial brace level
-    t.lexer.begin('ccode')                     # Enter 'ccode' state
-
-# Rules for the ccode state
-def t_ccode_lbrace(t):     
-    r'\{'
-    t.lexer.level +=1                
-
-def t_ccode_rbrace(t):
-    r'\}'
-    t.lexer.level -=1
-
-    # If closing brace, return the code fragment
-    if t.lexer.level == 0:
-         t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1]
-         t.type = "CCODE"
-         t.lexer.lineno += t.value.count('\n')
-         t.lexer.begin('INITIAL')           
-         return t
-
-# C or C++ comment (ignore)    
-def t_ccode_comment(t):
-    r'(/\*(.|\n)*?\*/)|(//.*)'
-    pass
-
-# C string
-def t_ccode_string(t):
-   r'\"([^\\\n]|(\\.))*?\"'
-
-# C character literal
-def t_ccode_char(t):
-   r'\'([^\\\n]|(\\.))*?\''
-
-# Any sequence of non-whitespace characters (not braces, strings)
-def t_ccode_nonspace(t):
-   r'[^\s\{\}\'\"]+'
-
-# Ignored characters (whitespace)
-t_ccode_ignore = " \t\n"
-
-# For bad characters, we just skip over it
-def t_ccode_error(t):
-    t.lexer.skip(1)
-
-
In this example, the occurrence of the first '{' causes the lexer to record the starting position and enter a new state 'ccode'. A collection of rules then match various parts of the input that follow (comments, strings, etc.). All of these rules merely discard the token (by not returning a value). However, if the closing right brace is encountered, the rule t_ccode_rbrace collects all of the code (using the earlier recorded starting position), stores it, and returns a token 'CCODE' containing all of that text. When returning the token, the lexing state is restored back to its initial state.

4.20 Miscellaneous Issues


-

  • The lexer requires input to be supplied as a single input string. Since most machines have more than enough memory, this rarely presents a performance concern. However, it means that the lexer currently can't be used with streaming data such as open files or sockets. This limitation is primarily a side-effect of using the re module. You might be able to work around this by implementing an appropriate t_eof() end-of-file handling rule; a rough sketch appears after this list. The main complication here is that you'll probably need to ensure that data is fed to the lexer in a way that doesn't split it in the middle of a token.


    -

  • The lexer should work properly with Unicode strings given both as token and pattern matching rules and as input text.

    -

  • If you need to supply optional flags to the re.compile() function, use the reflags option to lex. For example:
    -
    -lex.lex(reflags=re.UNICODE | re.VERBOSE)
    -
    -
    Note: by default, reflags is set to re.VERBOSE. If you provide your own flags, you may need to include this for PLY to preserve its normal behavior.

    -

  • Since the lexer is written entirely in Python, its performance is largely determined by that of the Python re module. Although the lexer has been written to be as efficient as possible, it's not blazingly fast when used on very large input files. If performance is a concern, you might consider upgrading to the most recent version of Python, creating a hand-written lexer, or offloading the lexer into a C extension module.

    If you are going to create a hand-written lexer and you plan to use it with yacc.py, it only needs to conform to the following requirements:

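As a rough illustration of the end-of-file workaround suggested in the first bullet above, a t_eof() rule can keep pulling data from an open file; the file name is hypothetical, and the line-at-a-time feeding assumes no token ever spans a line boundary:

-
-# Hypothetical sketch: feed the lexer one line at a time from an open file.
-f = open('input.txt')
-
-def t_eof(t):
-    more = f.readline()
-    if more:
-        t.lexer.input(more)
-        return t.lexer.token()
-    return None
-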

    5. Parsing basics

yacc.py is used to parse language syntax. Before showing an example, there are a few important bits of background that must be mentioned. First, syntax is usually specified in terms of a BNF grammar. For example, if you wanted to parse simple arithmetic expressions, you might first write an unambiguous grammar specification like this:
    -
     
    -expression : expression + term
    -           | expression - term
    -           | term
    -
    -term       : term * factor
    -           | term / factor
    -           | factor
    -
    -factor     : NUMBER
    -           | ( expression )
    -
    -
In the grammar, symbols such as NUMBER, +, -, *, and / are known as terminals and correspond to raw input tokens. Identifiers such as term and factor refer to grammar rules composed of a collection of terminals and other rules. These identifiers are known as non-terminals.

The semantic behavior of a language is often specified using a technique known as syntax directed translation. In syntax directed translation, attributes are attached to each symbol in a given grammar rule along with an action. Whenever a particular grammar rule is recognized, the action describes what to do. For example, given the expression grammar above, you might write the specification for a simple calculator like this:

    -
     
    -Grammar                             Action
    ---------------------------------    -------------------------------------------- 
    -expression0 : expression1 + term    expression0.val = expression1.val + term.val
    -            | expression1 - term    expression0.val = expression1.val - term.val
    -            | term                  expression0.val = term.val
    -
    -term0       : term1 * factor        term0.val = term1.val * factor.val
    -            | term1 / factor        term0.val = term1.val / factor.val
    -            | factor                term0.val = factor.val
    -
    -factor      : NUMBER                factor.val = int(NUMBER.lexval)
    -            | ( expression )        factor.val = expression.val
    -
    -
A good way to think about syntax directed translation is to view each symbol in the grammar as a kind of object. Associated with each symbol is a value representing its "state" (for example, the val attribute above). Semantic actions are then expressed as a collection of functions or methods that operate on the symbols and associated values.

Yacc uses a parsing technique known as LR-parsing or shift-reduce parsing. LR parsing is a bottom up technique that tries to recognize the right-hand-side of various grammar rules. Whenever a valid right-hand-side is found in the input, the appropriate action code is triggered and the grammar symbols are replaced by the grammar symbol on the left-hand-side.

LR parsing is commonly implemented by shifting grammar symbols onto a stack and looking at the stack and the next input token for patterns that match one of the grammar rules. The details of the algorithm can be found in a compiler textbook, but the following example illustrates the steps that are performed if you wanted to parse the expression 3 + 5 * (10 - 20) using the grammar defined above. In the example, the special symbol $ represents the end of input.

    -
    -Step Symbol Stack           Input Tokens            Action
    ----- ---------------------  ---------------------   -------------------------------
    -1                           3 + 5 * ( 10 - 20 )$    Shift 3
    -2    3                        + 5 * ( 10 - 20 )$    Reduce factor : NUMBER
    -3    factor                   + 5 * ( 10 - 20 )$    Reduce term   : factor
    -4    term                     + 5 * ( 10 - 20 )$    Reduce expr : term
    -5    expr                     + 5 * ( 10 - 20 )$    Shift +
    -6    expr +                     5 * ( 10 - 20 )$    Shift 5
    -7    expr + 5                     * ( 10 - 20 )$    Reduce factor : NUMBER
    -8    expr + factor                * ( 10 - 20 )$    Reduce term   : factor
    -9    expr + term                  * ( 10 - 20 )$    Shift *
    -10   expr + term *                  ( 10 - 20 )$    Shift (
    -11   expr + term * (                  10 - 20 )$    Shift 10
    -12   expr + term * ( 10                  - 20 )$    Reduce factor : NUMBER
    -13   expr + term * ( factor              - 20 )$    Reduce term : factor
    -14   expr + term * ( term                - 20 )$    Reduce expr : term
    -15   expr + term * ( expr                - 20 )$    Shift -
    -16   expr + term * ( expr -                20 )$    Shift 20
    -17   expr + term * ( expr - 20                )$    Reduce factor : NUMBER
    -18   expr + term * ( expr - factor            )$    Reduce term : factor
    -19   expr + term * ( expr - term              )$    Reduce expr : expr - term
    -20   expr + term * ( expr                     )$    Shift )
    -21   expr + term * ( expr )                    $    Reduce factor : (expr)
    -22   expr + term * factor                      $    Reduce term : term * factor
    -23   expr + term                               $    Reduce expr : expr + term
    -24   expr                                      $    Reduce expr
    -25                                             $    Success!
    -
    -
When parsing the expression, an underlying state machine and the current input token determine what happens next. If the next token looks like part of a valid grammar rule (based on other items on the stack), it is generally shifted onto the stack. If the top of the stack contains a valid right-hand-side of a grammar rule, it is usually "reduced" and the symbols replaced with the symbol on the left-hand-side. When this reduction occurs, the appropriate action is triggered (if defined). If the input token can't be shifted and the top of stack doesn't match any grammar rules, a syntax error has occurred and the parser must take some kind of recovery step (or bail out). A parse is only successful if the parser reaches a state where the symbol stack is empty and there are no more input tokens.

It is important to note that the underlying implementation is built around a large finite-state machine that is encoded in a collection of tables. The construction of these tables is non-trivial and beyond the scope of this discussion. However, subtle details of this process explain why, in the example above, the parser chooses to shift a token onto the stack in step 9 rather than reducing the rule expr : expr + term.

    6. Yacc

The ply.yacc module implements the parsing component of PLY. The name "yacc" stands for "Yet Another Compiler Compiler" and is borrowed from the Unix tool of the same name.

    6.1 An example

Suppose you wanted to make a grammar for simple arithmetic expressions as previously described. Here is how you would do it with yacc.py:
    -
    -# Yacc example
    -
    -import ply.yacc as yacc
    -
    -# Get the token map from the lexer.  This is required.
    -from calclex import tokens
    -
    -def p_expression_plus(p):
    -    'expression : expression PLUS term'
    -    p[0] = p[1] + p[3]
    -
    -def p_expression_minus(p):
    -    'expression : expression MINUS term'
    -    p[0] = p[1] - p[3]
    -
    -def p_expression_term(p):
    -    'expression : term'
    -    p[0] = p[1]
    -
    -def p_term_times(p):
    -    'term : term TIMES factor'
    -    p[0] = p[1] * p[3]
    -
    -def p_term_div(p):
    -    'term : term DIVIDE factor'
    -    p[0] = p[1] / p[3]
    -
    -def p_term_factor(p):
    -    'term : factor'
    -    p[0] = p[1]
    -
    -def p_factor_num(p):
    -    'factor : NUMBER'
    -    p[0] = p[1]
    -
    -def p_factor_expr(p):
    -    'factor : LPAREN expression RPAREN'
    -    p[0] = p[2]
    -
    -# Error rule for syntax errors
    -def p_error(p):
    -    print("Syntax error in input!")
    -
    -# Build the parser
    -parser = yacc.yacc()
    -
    -while True:
    -   try:
    -       s = input('calc > ')
    -   except EOFError:
    -       break
    -   if not s: continue
    -   result = parser.parse(s)
    -   print(result)
    -
    -
In this example, each grammar rule is defined by a Python function where the docstring to that function contains the appropriate context-free grammar specification. The statements that make up the function body implement the semantic actions of the rule. Each function accepts a single argument p that is a sequence containing the values of each grammar symbol in the corresponding rule. The values of p[i] are mapped to grammar symbols as shown here:
    -
    -def p_expression_plus(p):
    -    'expression : expression PLUS term'
    -    #   ^            ^        ^    ^
    -    #  p[0]         p[1]     p[2] p[3]
    -
    -    p[0] = p[1] + p[3]
    -
    -
    - -

For tokens, the "value" of the corresponding p[i] is the same as the t.value attribute assigned in the lexer module. For non-terminals, the value is determined by whatever is placed in p[0] when rules are reduced. This value can be anything at all. However, it is probably most common for the value to be a simple Python type, a tuple, or an instance. In this example, we are relying on the fact that the NUMBER token stores an integer value in its value field. All of the other rules simply perform various types of integer operations and propagate the result.
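For instance, instead of computing numeric results directly, the same rules could build a small tuple-based parse tree in p[0]; the tag strings used here are arbitrary:

-
-def p_expression_plus(p):
-    'expression : expression PLUS term'
-    # build a node describing the operation instead of computing a number
-    p[0] = ('+', p[1], p[3])
-
-def p_factor_num(p):
-    'factor : NUMBER'
-    p[0] = ('num', p[1])
-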


Note: The use of negative indices has a special meaning in yacc. In particular, p[-1] does not have the same value as p[3] in this example. Please see the section on "Embedded Actions" for further details.


The first rule defined in the yacc specification determines the starting grammar symbol (in this case, a rule for expression appears first). Whenever the starting rule is reduced by the parser and no more input is available, parsing stops and the final value is returned (this value will be whatever the top-most rule placed in p[0]). Note: an alternative starting symbol can be specified using the start keyword argument to yacc().

The p_error(p) rule is defined to catch syntax errors. See the error handling section below for more detail.

To build the parser, call the yacc.yacc() function. This function looks at the module and attempts to construct all of the LR parsing tables for the grammar you have specified. The first time yacc.yacc() is invoked, you will get a message such as this:

    -
    -$ python calcparse.py
    -Generating LALR tables
    -calc > 
    -
    -
    - -

Since table construction is relatively expensive (especially for large grammars), the resulting parsing table is written to a file called parsetab.py. In addition, a debugging file called parser.out is created. On subsequent executions, yacc will reload the table from parsetab.py unless it has detected a change in the underlying grammar (in which case the tables and parsetab.py file are regenerated). Both of these files are written to the same directory as the module in which the parser is specified. The name of the parsetab module can be changed using the tabmodule keyword argument to yacc(). For example:
    -
    -parser = yacc.yacc(tabmodule='fooparsetab')
    -
    -
    - -

If any errors are detected in your grammar specification, yacc.py will produce diagnostic messages and possibly raise an exception; a number of common specification errors can be detected and reported.

The next few sections discuss grammar specification in more detail.

The final part of the example shows how to actually run the parser created by yacc(). To run the parser, you simply have to call parse() with a string of input text. This will run all of the grammar rules and return the result of the entire parse. The result returned is the value assigned to p[0] in the starting grammar rule.
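Putting that together, a small sketch of calling the calculator parser built in the example above on a string:

-
-result = parser.parse("2 * (3 + 4)")
-print(result)        # prints 14 with the calculator rules above
-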

    6.2 Combining Grammar Rule Functions

When grammar rules are similar, they can be combined into a single function. For example, consider the two rules in our earlier example:
    -
    -def p_expression_plus(p):
    -    'expression : expression PLUS term'
    -    p[0] = p[1] + p[3]
    -
    -def p_expression_minus(p):
    -    'expression : expression MINUS term'
    -    p[0] = p[1] - p[3]
    -
    -
Instead of writing two functions, you might write a single function like this:
    -
    -def p_expression(p):
    -    '''expression : expression PLUS term
    -                  | expression MINUS term'''
    -    if p[2] == '+':
    -        p[0] = p[1] + p[3]
    -    elif p[2] == '-':
    -        p[0] = p[1] - p[3]
    -
    -
In general, the doc string for any given function can contain multiple grammar rules. So, it would have also been legal (although possibly confusing) to write this:
    -
    -def p_binary_operators(p):
    -    '''expression : expression PLUS term
    -                  | expression MINUS term
    -       term       : term TIMES factor
    -                  | term DIVIDE factor'''
    -    if p[2] == '+':
    -        p[0] = p[1] + p[3]
    -    elif p[2] == '-':
    -        p[0] = p[1] - p[3]
    -    elif p[2] == '*':
    -        p[0] = p[1] * p[3]
    -    elif p[2] == '/':
    -        p[0] = p[1] / p[3]
    -
    -
When combining grammar rules into a single function, it is usually a good idea for all of the rules to have a similar structure (e.g., the same number of terms). Otherwise, the corresponding action code may be more complicated than necessary. However, it is possible to handle simple cases using len(). For example:
    -
    -def p_expressions(p):
    -    '''expression : expression MINUS expression
    -                  | MINUS expression'''
    -    if (len(p) == 4):
    -        p[0] = p[1] - p[3]
    -    elif (len(p) == 3):
    -        p[0] = -p[2]
    -
    -
If parsing performance is a concern, you should resist the urge to put too much conditional processing into a single grammar rule as shown in these examples. When you add checks to see which grammar rule is being handled, you are actually duplicating the work that the parser has already performed (i.e., the parser already knows exactly what rule it matched). You can eliminate this overhead by using a separate p_rule() function for each grammar rule.

    6.3 Character Literals

If desired, a grammar may contain tokens defined as single character literals. For example:
    -
    -def p_binary_operators(p):
    -    '''expression : expression '+' term
    -                  | expression '-' term
    -       term       : term '*' factor
    -                  | term '/' factor'''
    -    if p[2] == '+':
    -        p[0] = p[1] + p[3]
    -    elif p[2] == '-':
    -        p[0] = p[1] - p[3]
    -    elif p[2] == '*':
    -        p[0] = p[1] * p[3]
    -    elif p[2] == '/':
    -        p[0] = p[1] / p[3]
    -
    -
A character literal must be enclosed in quotes such as '+'. In addition, if literals are used, they must be declared in the corresponding lex file through the use of a special literals declaration.
    -
    -# Literals.  Should be placed in module given to lex()
    -literals = ['+','-','*','/' ]
    -
    -
Character literals are limited to a single character. Thus, it is not legal to specify literals such as '<=' or '=='. For this, use the normal lexing rules (e.g., define a rule such as t_EQ = r'==').

    6.4 Empty Productions

yacc.py can handle empty productions by defining a rule like this:
    -
    -def p_empty(p):
    -    'empty :'
    -    pass
    -
    -
Now, to use the empty production, simply use 'empty' as a symbol. For example:
    -
    -def p_optitem(p):
    -    '''optitem : item
    -               | empty'''
    -    ...
    -
    -
Note: You can write empty rules anywhere by simply specifying an empty right hand side. However, I personally find that writing an "empty" rule and using "empty" to denote an empty production is easier to read and more clearly states your intentions.
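As a further sketch of the same idiom, an empty rule is handy for optional elements; the optsemi symbol and SEMI token here are purely illustrative and not part of the calculator grammar:

-
-def p_empty(p):
-    'empty :'
-    pass
-
-# Hypothetical optional trailing semicolon, written with the 'empty' idiom
-def p_optsemi(p):
-    '''optsemi : SEMI
-               | empty'''
-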

    6.5 Changing the starting symbol

Normally, the first rule found in a yacc specification defines the starting grammar rule (top level rule). To change this, simply supply a start specifier in your file. For example:
    -
    -start = 'foo'
    -
    -def p_bar(p):
    -    'bar : A B'
    -
    -# This is the starting rule due to the start specifier above
    -def p_foo(p):
    -    'foo : bar X'
    -...
    -
    -
The use of a start specifier may be useful during debugging since you can use it to have yacc build a subset of a larger grammar. For this purpose, it is also possible to specify a starting symbol as an argument to yacc(). For example:
    -
    -parser = yacc.yacc(start='foo')
    -
    -

    6.6 Dealing With Ambiguous Grammars

The expression grammar given in the earlier example has been written in a special format to eliminate ambiguity. However, in many situations, it is extremely difficult or awkward to write grammars in this format. A much more natural way to express the grammar is in a more compact form like this:
    -
    -expression : expression PLUS expression
    -           | expression MINUS expression
    -           | expression TIMES expression
    -           | expression DIVIDE expression
    -           | LPAREN expression RPAREN
    -           | NUMBER
    -
    -
Unfortunately, this grammar specification is ambiguous. For example, if you are parsing the string "3 * 4 + 5", there is no way to tell how the operators are supposed to be grouped. Does the expression mean "(3 * 4) + 5" or "3 * (4 + 5)"?

When an ambiguous grammar is given to yacc.py, it will print messages about "shift/reduce conflicts" or "reduce/reduce conflicts". A shift/reduce conflict is caused when the parser generator can't decide whether to reduce a rule or shift a symbol onto the parsing stack. For example, consider the string "3 * 4 + 5" and the internal parsing stack:

    -
    -Step Symbol Stack           Input Tokens            Action
    ----- ---------------------  ---------------------   -------------------------------
    -1    $                                3 * 4 + 5$    Shift 3
    -2    $ 3                                * 4 + 5$    Reduce expression : NUMBER
    -3    $ expr                             * 4 + 5$    Shift *
    -4    $ expr *                             4 + 5$    Shift 4
    -5    $ expr * 4                             + 5$    Reduce expression : NUMBER
    -6    $ expr * expr                          + 5$    SHIFT/REDUCE CONFLICT ????
    -
    -
In this case, when the parser reaches step 6, it has two options. One is to reduce the rule expr : expr * expr on the stack. The other option is to shift the token + onto the stack. Both options are perfectly legal from the rules of the context-free grammar.

By default, all shift/reduce conflicts are resolved in favor of shifting. Therefore, in the above example, the parser will always shift the + instead of reducing. Although this strategy works in many cases (for example, the case of "if-then" versus "if-then-else"), it is not enough for arithmetic expressions. In fact, in the above example, the decision to shift + is completely wrong; we should have reduced expr * expr since multiplication has higher mathematical precedence than addition.

To resolve ambiguity, especially in expression grammars, yacc.py allows individual tokens to be assigned a precedence level and associativity. This is done by adding a variable precedence to the grammar file like this:

    -
    -precedence = (
    -    ('left', 'PLUS', 'MINUS'),
    -    ('left', 'TIMES', 'DIVIDE'),
    -)
    -
    -
This declaration specifies that PLUS/MINUS have the same precedence level and are left-associative and that TIMES/DIVIDE have the same precedence and are left-associative. Within the precedence declaration, tokens are ordered from lowest to highest precedence. Thus, this declaration specifies that TIMES/DIVIDE have higher precedence than PLUS/MINUS (since they appear later in the precedence specification).

The precedence specification works by associating a numerical precedence level value and associativity direction to the listed tokens. In the above example, you get:

    -
    -PLUS      : level = 1,  assoc = 'left'
    -MINUS     : level = 1,  assoc = 'left'
    -TIMES     : level = 2,  assoc = 'left'
    -DIVIDE    : level = 2,  assoc = 'left'
    -
    -
These values are then used to attach a numerical precedence value and associativity direction to each grammar rule. This is always determined by looking at the precedence of the right-most terminal symbol. For example:
    -
    -expression : expression PLUS expression                 # level = 1, left
    -           | expression MINUS expression                # level = 1, left
    -           | expression TIMES expression                # level = 2, left
    -           | expression DIVIDE expression               # level = 2, left
    -           | LPAREN expression RPAREN                   # level = None (not specified)
    -           | NUMBER                                     # level = None (not specified)
    -
    -
When shift/reduce conflicts are encountered, the parser generator resolves the conflict by looking at the precedence rules and associativity specifiers.

    -

      -
1. If the current token has higher precedence than the rule on the stack, it is shifted.
2. If the grammar rule on the stack has higher precedence, the rule is reduced.
3. If the current token and the grammar rule have the same precedence, the rule is reduced for left associativity, whereas the token is shifted for right associativity.
4. If nothing is known about the precedence, shift/reduce conflicts are resolved in favor of shifting (the default).
For example, if "expression PLUS expression" has been parsed and the next token is "TIMES", the action is going to be a shift because "TIMES" has a higher precedence level than "PLUS". On the other hand, if "expression TIMES expression" has been parsed and the next token is "PLUS", the action is going to be reduce because "PLUS" has a lower precedence than "TIMES".

When shift/reduce conflicts are resolved using the first three techniques (with the help of precedence rules), yacc.py will report no errors or conflicts in the grammar (although it will print some information in the parser.out debugging file).

One problem with the precedence specifier technique is that it is sometimes necessary to change the precedence of an operator in certain contexts. For example, consider a unary-minus operator in "3 + 4 * -5". Mathematically, the unary minus is normally given a very high precedence, being evaluated before the multiply. However, in our precedence specifier, MINUS has a lower precedence than TIMES. To deal with this, precedence rules can be given for so-called "fictitious tokens" like this:

    -
    -precedence = (
    -    ('left', 'PLUS', 'MINUS'),
    -    ('left', 'TIMES', 'DIVIDE'),
    -    ('right', 'UMINUS'),            # Unary minus operator
    -)
    -
    -
Now, in the grammar file, we can write our unary minus rule like this:
    -
    -def p_expr_uminus(p):
    -    'expression : MINUS expression %prec UMINUS'
    -    p[0] = -p[2]
    -
    -
In this case, %prec UMINUS overrides the default rule precedence, setting it to that of UMINUS in the precedence specifier.

At first, the use of UMINUS in this example may appear very confusing. UMINUS is not an input token or a grammar rule. Instead, you should think of it as the name of a special marker in the precedence table. When you use the %prec qualifier, you're simply telling yacc that you want the precedence of the expression to be the same as for this special marker instead of the usual precedence.

It is also possible to specify non-associativity in the precedence table. This would be used when you don't want operations to chain together. For example, suppose you wanted to support comparison operators like < and > but you didn't want to allow combinations like a < b < c. To do this, simply specify a rule like this:

    -
    -precedence = (
    -    ('nonassoc', 'LESSTHAN', 'GREATERTHAN'),  # Nonassociative operators
    -    ('left', 'PLUS', 'MINUS'),
    -    ('left', 'TIMES', 'DIVIDE'),
    -    ('right', 'UMINUS'),            # Unary minus operator
    -)
    -
    -
If you do this, the occurrence of input text such as a < b < c will result in a syntax error. However, simple expressions such as a < b will still be fine.

Reduce/reduce conflicts are caused when there are multiple grammar rules that can be applied to a given set of symbols. This kind of conflict is almost always bad and is always resolved by picking the rule that appears first in the grammar file. Reduce/reduce conflicts are almost always caused when different sets of grammar rules somehow generate the same set of symbols. For example:

    -
    -assignment :  ID EQUALS NUMBER
    -           |  ID EQUALS expression
    -           
    -expression : expression PLUS expression
    -           | expression MINUS expression
    -           | expression TIMES expression
    -           | expression DIVIDE expression
    -           | LPAREN expression RPAREN
    -           | NUMBER
    -
    -
In this case, a reduce/reduce conflict exists between these two rules:
    -
    -assignment  : ID EQUALS NUMBER
    -expression  : NUMBER
    -
    -
For example, if you wrote "a = 5", the parser can't figure out if this is supposed to be reduced as assignment : ID EQUALS NUMBER or whether it's supposed to reduce the 5 as an expression and then reduce the rule assignment : ID EQUALS expression.

It should be noted that reduce/reduce conflicts are notoriously difficult to spot by simply looking at the input grammar. When a reduce/reduce conflict occurs, yacc() will try to help by printing a warning message such as this:

    -
    -WARNING: 1 reduce/reduce conflict
    -WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER)
    -WARNING: rejected rule (expression -> NUMBER)
    -
    -
This message identifies the two rules that are in conflict. However, it may not tell you how the parser arrived at such a state. To try and figure it out, you'll probably have to look at your grammar and the contents of the parser.out debugging file with an appropriately high level of caffeination.

    6.7 The parser.out file

Tracking down shift/reduce and reduce/reduce conflicts is one of the finer pleasures of using an LR parsing algorithm. To assist in debugging, yacc.py creates a debugging file called 'parser.out' when it generates the parsing table. The contents of this file look like the following:
    -
    -Unused terminals:
    -
    -
    -Grammar
    -
    -Rule 1     expression -> expression PLUS expression
    -Rule 2     expression -> expression MINUS expression
    -Rule 3     expression -> expression TIMES expression
    -Rule 4     expression -> expression DIVIDE expression
    -Rule 5     expression -> NUMBER
    -Rule 6     expression -> LPAREN expression RPAREN
    -
    -Terminals, with rules where they appear
    -
    -TIMES                : 3
    -error                : 
    -MINUS                : 2
    -RPAREN               : 6
    -LPAREN               : 6
    -DIVIDE               : 4
    -PLUS                 : 1
    -NUMBER               : 5
    -
    -Nonterminals, with rules where they appear
    -
    -expression           : 1 1 2 2 3 3 4 4 6 0
    -
    -
    -Parsing method: LALR
    -
    -
    -state 0
    -
    -    S' -> . expression
    -    expression -> . expression PLUS expression
    -    expression -> . expression MINUS expression
    -    expression -> . expression TIMES expression
    -    expression -> . expression DIVIDE expression
    -    expression -> . NUMBER
    -    expression -> . LPAREN expression RPAREN
    -
    -    NUMBER          shift and go to state 3
    -    LPAREN          shift and go to state 2
    -
    -
    -state 1
    -
    -    S' -> expression .
    -    expression -> expression . PLUS expression
    -    expression -> expression . MINUS expression
    -    expression -> expression . TIMES expression
    -    expression -> expression . DIVIDE expression
    -
    -    PLUS            shift and go to state 6
    -    MINUS           shift and go to state 5
    -    TIMES           shift and go to state 4
    -    DIVIDE          shift and go to state 7
    -
    -
    -state 2
    -
    -    expression -> LPAREN . expression RPAREN
    -    expression -> . expression PLUS expression
    -    expression -> . expression MINUS expression
    -    expression -> . expression TIMES expression
    -    expression -> . expression DIVIDE expression
    -    expression -> . NUMBER
    -    expression -> . LPAREN expression RPAREN
    -
    -    NUMBER          shift and go to state 3
    -    LPAREN          shift and go to state 2
    -
    -
    -state 3
    -
    -    expression -> NUMBER .
    -
    -    $               reduce using rule 5
    -    PLUS            reduce using rule 5
    -    MINUS           reduce using rule 5
    -    TIMES           reduce using rule 5
    -    DIVIDE          reduce using rule 5
    -    RPAREN          reduce using rule 5
    -
    -
    -state 4
    -
    -    expression -> expression TIMES . expression
    -    expression -> . expression PLUS expression
    -    expression -> . expression MINUS expression
    -    expression -> . expression TIMES expression
    -    expression -> . expression DIVIDE expression
    -    expression -> . NUMBER
    -    expression -> . LPAREN expression RPAREN
    -
    -    NUMBER          shift and go to state 3
    -    LPAREN          shift and go to state 2
    -
    -
    -state 5
    -
    -    expression -> expression MINUS . expression
    -    expression -> . expression PLUS expression
    -    expression -> . expression MINUS expression
    -    expression -> . expression TIMES expression
    -    expression -> . expression DIVIDE expression
    -    expression -> . NUMBER
    -    expression -> . LPAREN expression RPAREN
    -
    -    NUMBER          shift and go to state 3
    -    LPAREN          shift and go to state 2
    -
    -
    -state 6
    -
    -    expression -> expression PLUS . expression
    -    expression -> . expression PLUS expression
    -    expression -> . expression MINUS expression
    -    expression -> . expression TIMES expression
    -    expression -> . expression DIVIDE expression
    -    expression -> . NUMBER
    -    expression -> . LPAREN expression RPAREN
    -
    -    NUMBER          shift and go to state 3
    -    LPAREN          shift and go to state 2
    -
    -
    -state 7
    -
    -    expression -> expression DIVIDE . expression
    -    expression -> . expression PLUS expression
    -    expression -> . expression MINUS expression
    -    expression -> . expression TIMES expression
    -    expression -> . expression DIVIDE expression
    -    expression -> . NUMBER
    -    expression -> . LPAREN expression RPAREN
    -
    -    NUMBER          shift and go to state 3
    -    LPAREN          shift and go to state 2
    -
    -
    -state 8
    -
    -    expression -> LPAREN expression . RPAREN
    -    expression -> expression . PLUS expression
    -    expression -> expression . MINUS expression
    -    expression -> expression . TIMES expression
    -    expression -> expression . DIVIDE expression
    -
    -    RPAREN          shift and go to state 13
    -    PLUS            shift and go to state 6
    -    MINUS           shift and go to state 5
    -    TIMES           shift and go to state 4
    -    DIVIDE          shift and go to state 7
    -
    -
    -state 9
    -
    -    expression -> expression TIMES expression .
    -    expression -> expression . PLUS expression
    -    expression -> expression . MINUS expression
    -    expression -> expression . TIMES expression
    -    expression -> expression . DIVIDE expression
    -
    -    $               reduce using rule 3
    -    PLUS            reduce using rule 3
    -    MINUS           reduce using rule 3
    -    TIMES           reduce using rule 3
    -    DIVIDE          reduce using rule 3
    -    RPAREN          reduce using rule 3
    -
    -  ! PLUS            [ shift and go to state 6 ]
    -  ! MINUS           [ shift and go to state 5 ]
    -  ! TIMES           [ shift and go to state 4 ]
    -  ! DIVIDE          [ shift and go to state 7 ]
    -
    -state 10
    -
    -    expression -> expression MINUS expression .
    -    expression -> expression . PLUS expression
    -    expression -> expression . MINUS expression
    -    expression -> expression . TIMES expression
    -    expression -> expression . DIVIDE expression
    -
    -    $               reduce using rule 2
    -    PLUS            reduce using rule 2
    -    MINUS           reduce using rule 2
    -    RPAREN          reduce using rule 2
    -    TIMES           shift and go to state 4
    -    DIVIDE          shift and go to state 7
    -
    -  ! TIMES           [ reduce using rule 2 ]
    -  ! DIVIDE          [ reduce using rule 2 ]
    -  ! PLUS            [ shift and go to state 6 ]
    -  ! MINUS           [ shift and go to state 5 ]
    -
    -state 11
    -
    -    expression -> expression PLUS expression .
    -    expression -> expression . PLUS expression
    -    expression -> expression . MINUS expression
    -    expression -> expression . TIMES expression
    -    expression -> expression . DIVIDE expression
    -
    -    $               reduce using rule 1
    -    PLUS            reduce using rule 1
    -    MINUS           reduce using rule 1
    -    RPAREN          reduce using rule 1
    -    TIMES           shift and go to state 4
    -    DIVIDE          shift and go to state 7
    -
    -  ! TIMES           [ reduce using rule 1 ]
    -  ! DIVIDE          [ reduce using rule 1 ]
    -  ! PLUS            [ shift and go to state 6 ]
    -  ! MINUS           [ shift and go to state 5 ]
    -
    -state 12
    -
    -    expression -> expression DIVIDE expression .
    -    expression -> expression . PLUS expression
    -    expression -> expression . MINUS expression
    -    expression -> expression . TIMES expression
    -    expression -> expression . DIVIDE expression
    -
    -    $               reduce using rule 4
    -    PLUS            reduce using rule 4
    -    MINUS           reduce using rule 4
    -    TIMES           reduce using rule 4
    -    DIVIDE          reduce using rule 4
    -    RPAREN          reduce using rule 4
    -
    -  ! PLUS            [ shift and go to state 6 ]
    -  ! MINUS           [ shift and go to state 5 ]
    -  ! TIMES           [ shift and go to state 4 ]
    -  ! DIVIDE          [ shift and go to state 7 ]
    -
    -state 13
    -
    -    expression -> LPAREN expression RPAREN .
    -
    -    $               reduce using rule 6
    -    PLUS            reduce using rule 6
    -    MINUS           reduce using rule 6
    -    TIMES           reduce using rule 6
    -    DIVIDE          reduce using rule 6
    -    RPAREN          reduce using rule 6
    -
    -
The different states that appear in this file are a representation of every possible sequence of valid input tokens allowed by the grammar. When receiving input tokens, the parser is building up a stack and looking for matching rules. Each state keeps track of the grammar rules that might be in the process of being matched at that point. Within each rule, the "." character indicates the current location of the parse within that rule. In addition, the actions for each valid input token are listed. When a shift/reduce or reduce/reduce conflict arises, rules not selected are prefixed with an !. For example:
    -
    -  ! TIMES           [ reduce using rule 2 ]
    -  ! DIVIDE          [ reduce using rule 2 ]
    -  ! PLUS            [ shift and go to state 6 ]
    -  ! MINUS           [ shift and go to state 5 ]
    -
    -
By looking at these rules (and with a little practice), you can usually track down the source of most parsing conflicts. It should also be stressed that not all shift-reduce conflicts are bad. However, the only way to be sure that they are resolved correctly is to look at parser.out.

    6.8 Syntax Error Handling

    -If you are creating a parser for production use, the handling of syntax errors is important. As a general rule, you don't want a parser to simply throw up its hands and stop at the first sign of trouble. Instead, you want it to report the error, recover if possible, and continue parsing so that all of the errors in the input get reported to the user at once. This is the standard behavior found in compilers for languages such as C, C++, and Java.
    -
    -In PLY, when a syntax error occurs during parsing, the error is immediately detected (i.e., the parser does not read any more tokens beyond the source of the error). However, at this point, the parser enters a recovery mode that can be used to try and continue further parsing. As a general rule, error recovery in LR parsers is a delicate topic that involves ancient rituals and black magic. The recovery mechanism provided by yacc.py is comparable to Unix yacc, so you may want to consult a book like O'Reilly's "Lex and Yacc" for some of the finer details.

    -When a syntax error occurs, yacc.py performs the following steps: - -

      -
    1. On the first occurrence of an error, the user-defined p_error() function is called with the offending token as an argument. However, if the syntax error is due to reaching the end-of-file, p_error() is called with an argument of None. Afterwards, the parser enters an "error-recovery" mode in which it will not make future calls to p_error() until it has successfully shifted at least 3 tokens onto the parsing stack.

      -

    2. If no recovery action is taken in p_error(), the offending lookahead token is replaced -with a special error token. - -

      -

    3. If the offending lookahead token is already set to error, the top item of the parsing stack is -deleted. - -

      -

    4. If the entire parsing stack is unwound, the parser enters a restart state and attempts to start -parsing from its initial state. - -

      -

    5. If a grammar rule accepts error as a token, it will be -shifted onto the parsing stack. - -

      -

    6. If the top item of the parsing stack is error, lookahead tokens will be discarded until the -parser can successfully shift a new symbol or reduce a rule involving error. -
    - -

    6.8.1 Recovery and resynchronization with error rules

    - - -The most well-behaved approach for handling syntax errors is to write grammar rules that include the error -token. For example, suppose your language had a grammar rule for a print statement like this: - -
    -
    -def p_statement_print(p):
    -     'statement : PRINT expr SEMI'
    -     ...
    -
    -
    - -To account for the possibility of a bad expression, you might write an additional grammar rule like this: - -
    -
    -def p_statement_print_error(p):
    -     'statement : PRINT error SEMI'
    -     print("Syntax error in print statement. Bad expression")
    -
    -
    -
    - -In this case, the error token will match any sequence of -tokens that might appear up to the first semicolon that is -encountered. Once the semicolon is reached, the rule will be -invoked and the error token will go away. - -

    -This type of recovery is sometimes known as parser resynchronization. -The error token acts as a wildcard for any bad input text and -the token immediately following error acts as a -synchronization token. - -

    -It is important to note that the error token usually does not appear as the last token -on the right in an error rule. For example: - -

    -
    -def p_statement_print_error(p):
    -    'statement : PRINT error'
    -    print("Syntax error in print statement. Bad expression")
    -
    -
    - -This is because the first bad token encountered will cause the rule to -be reduced--which may make it difficult to recover if more bad tokens -immediately follow. - -

    6.8.2 Panic mode recovery

    - - -An alternative error recovery scheme is to enter a panic mode recovery in which tokens are -discarded to a point where the parser might be able to recover in some sensible manner. - -

    -Panic mode recovery is implemented entirely in the p_error() function. For example, this -function starts discarding tokens until it reaches a closing '}'. Then, it restarts the -parser in its initial state. - -

    -
    -def p_error(p):
    -    print("Whoa. You are seriously hosed.")
    -    if not p:
    -        print("End of File!")
    -        return
    -
    -    # Read ahead looking for a closing '}'
    -    while True:
    -        tok = parser.token()             # Get the next token
    -        if not tok or tok.type == 'RBRACE': 
    -            break
    -    parser.restart()
    -
    -
    - -

    -This function simply discards the bad token and tells the parser that the error was ok. - -

    -
    -def p_error(p):
    -    if p:
    -         print("Syntax error at token", p.type)
    -         # Just discard the token and tell the parser it's okay.
    -         parser.errok()
    -    else:
    -         print("Syntax error at EOF")
    -
    -
    - -

    -More information on these methods is as follows:
    -
    -parser.errok(). This resets the parser state so it doesn't think it's in error-recovery mode. This will prevent an error token from being generated and will reset the internal error counters so that the next syntax error will call p_error() again.
    -
    -parser.token(). This returns the next token on the input stream.
    -
    -parser.restart(). This discards the entire parsing stack and resets the parser to its initial state.

    -To supply the next lookahead token to the parser, p_error() can return a token. This might be -useful if trying to synchronize on special characters. For example: - -

    -
    -def p_error(p):
    -    # Read ahead looking for a terminating ";"
    -    while True:
    -        tok = parser.token()             # Get the next token
    -        if not tok or tok.type == 'SEMI': break
    -    parser.errok()
    -
    -    # Return SEMI to the parser as the next lookahead token
    -    return tok  
    -
    -
    - -

    -Keep in mind that in the above error handling functions, parser is an instance of the parser created by yacc(). You'll need to save this instance someplace in your code so that you can refer to it during error handling.
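For instance, a minimal sketch of that arrangement (the module-level names are illustrative; a real module would also define its tokens and grammar rules) might look like this:

    import ply.yacc as yacc

    def p_error(p):
        if p:
            print("Syntax error at token", p.type)
            parser.errok()       # 'parser' is the module-level instance saved below
        else:
            print("Syntax error at EOF")

    # Save the parser instance at module level so p_error() can refer to it
    parser = yacc.yacc()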

    - -

    6.8.3 Signalling an error from a production

    - - -If necessary, a production rule can manually force the parser to enter error recovery. This -is done by raising the SyntaxError exception like this: - -
    -
    -def p_production(p):
    -    'production : some production ...'
    -    raise SyntaxError
    -
    -
    - -The effect of raising SyntaxError is the same as if the last symbol shifted onto the -parsing stack was actually a syntax error. Thus, when you do this, the last symbol shifted is popped off -of the parsing stack and the current lookahead token is set to an error token. The parser -then enters error-recovery mode where it tries to reduce rules that can accept error tokens. -The steps that follow from this point are exactly the same as if a syntax error were detected and -p_error() were called. - -

    -One important aspect of manually setting an error is that the p_error() function will NOT be -called in this case. If you need to issue an error message, make sure you do it in the production that -raises SyntaxError. - -

    -Note: This feature of PLY is meant to mimic the behavior of the YYERROR macro in yacc. - -

    6.8.4 When Do Syntax Errors Get Reported

    - - -

    -In most cases, yacc will handle errors as soon as a bad input token is detected on the input. However, be aware that yacc may choose to delay error handling until after it has reduced one or more grammar rules first. This behavior might be unexpected, but it's related to special states in the underlying parsing table known as "defaulted states." A defaulted state is a parsing condition where the same grammar rule will be reduced regardless of what valid token comes next on the input. For such states, yacc chooses to go ahead and reduce the grammar rule without reading the next input token. If the next token is bad, yacc will eventually get around to reading it and report a syntax error. It's just a little unusual in that you might see some of your grammar rules firing immediately prior to the syntax error.

    - -

    -Usually, the delayed error reporting with defaulted states is harmless (and there are other reasons for wanting PLY to behave in this way). However, if you need to turn this behavior off for some reason, you can clear the defaulted states table like this:

    - -
    -
    -parser = yacc.yacc()
    -parser.defaulted_states = {}
    -
    -
    - -

    -Disabling defaulted states is not recommended if your grammar makes use -of embedded actions as described in Section 6.11.

    - -

    6.8.5 General comments on error handling

    - - -For normal types of languages, error recovery with error rules and resynchronization characters is probably the most reliable -technique. This is because you can instrument the grammar to catch errors at selected places where it is relatively easy -to recover and continue parsing. Panic mode recovery is really only useful in certain specialized applications where you might want -to discard huge portions of the input text to find a valid restart point. - -
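For instance, a hedged sketch of a statement-level synchronization rule (the statement and SEMI names are assumptions about your particular grammar) might be:

    def p_statement_error(p):
        'statement : error SEMI'
        # Everything up to the next ';' is discarded; parsing resumes afterwards
        print("Syntax error in statement; skipping to next ';'")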

    6.9 Line Number and Position Tracking

    -Position tracking is often a tricky problem when writing compilers. By default, PLY tracks the line number and position of all tokens. This information is available using the following functions:
    -
    -p.lineno(num). Return the line number for symbol num.
    -
    -p.lexpos(num). Return the lexing position for symbol num.
    -
    -For example:
    -
    -def p_expression(p):
    -    'expression : expression PLUS expression'
    -    line   = p.lineno(2)        # line number of the PLUS token
    -    index  = p.lexpos(2)        # Position of the PLUS token
    -
    -
    -As an optional feature, yacc.py can automatically track line numbers and positions for all of the grammar symbols as well. However, this extra tracking requires extra processing and can significantly slow down parsing. Therefore, it must be enabled by passing the tracking=True option to yacc.parse(). For example:
    -
    -yacc.parse(data,tracking=True)
    -
    -
    -Once enabled, the lineno() and lexpos() methods work for all grammar symbols. In addition, two additional methods can be used:
    -
    -p.linespan(num). Return a tuple (startline,endline) with the starting and ending line number for symbol num.
    -
    -p.lexspan(num). Return a tuple (start,end) with the starting and ending positions for symbol num.
    -
    -For example:
    -
    -def p_expression(p):
    -    'expression : expression PLUS expression'
    -    p.lineno(1)        # Line number of the left expression
    -    p.lineno(2)        # line number of the PLUS operator
    -    p.lineno(3)        # line number of the right expression
    -    ...
    -    start,end = p.linespan(3)    # Start,end lines of the right expression
    -    starti,endi = p.lexspan(3)   # Start,end positions of right expression
    -
    -
    -
    - -Note: The lexspan() function only returns the range of values up to the start of the last grammar symbol. - -

    -Although it may be convenient for PLY to track position information on -all grammar symbols, this is often unnecessary. For example, if you -are merely using line number information in an error message, you can -often just key off of a specific token in the grammar rule. For -example: - -

    -
    -def p_bad_func(p):
    -    'funccall : fname LPAREN error RPAREN'
    -    # Line number reported from LPAREN token
    -    print("Bad function call at line", p.lineno(2))
    -
    -
    - -

    -Similarly, you may get better parsing performance if you only -selectively propagate line number information where it's needed using -the p.set_lineno() method. For example: - -

    -
    -def p_fname(p):
    -    'fname : ID'
    -    p[0] = p[1]
    -    p.set_lineno(0,p.lineno(1))
    -
    -
    - -PLY doesn't retain line number information from rules that have already been -parsed. If you are building an abstract syntax tree and need to have line numbers, -you should make sure that the line numbers appear in the tree itself. - -
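For example, a sketch of recording the operator's line number directly in a tuple-based node (the node layout here is illustrative; token line numbers are available without tracking=True):

    def p_expression_binop(p):
        'expression : expression PLUS expression'
        # Keep the line number of the PLUS token inside the node itself
        p[0] = ('binop', p[2], p[1], p[3], p.lineno(2))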

    6.10 AST Construction

    - - -yacc.py provides no special functions for constructing an -abstract syntax tree. However, such construction is easy enough to do -on your own. - -

    A minimal way to construct a tree is to simply create and -propagate a tuple or list in each grammar rule function. There -are many possible ways to do this, but one example would be something -like this: - -

    -
    -def p_expression_binop(p):
    -    '''expression : expression PLUS expression
    -                  | expression MINUS expression
    -                  | expression TIMES expression
    -                  | expression DIVIDE expression'''
    -
    -    p[0] = ('binary-expression',p[2],p[1],p[3])
    -
    -def p_expression_group(p):
    -    'expression : LPAREN expression RPAREN'
    -    p[0] = ('group-expression',p[2])
    -
    -def p_expression_number(p):
    -    'expression : NUMBER'
    -    p[0] = ('number-expression',p[1])
    -
    -
    - -
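Assuming a lexer that delivers '+' and '*' as the values of the PLUS and TIMES tokens, and a grammar in which TIMES binds tighter than PLUS, parsing an input such as 2+3*4 with the rules above would make parser.parse() return a nested tuple roughly like this:

    ('binary-expression', '+',
        ('number-expression', 2),
        ('binary-expression', '*',
            ('number-expression', 3),
            ('number-expression', 4)))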

    -Another approach is to create a set of data structures for different kinds of abstract syntax tree nodes and assign nodes to p[0] in each rule. For example:

    -
    -class Expr: pass
    -
    -class BinOp(Expr):
    -    def __init__(self,left,op,right):
    -        self.type = "binop"
    -        self.left = left
    -        self.right = right
    -        self.op = op
    -
    -class Number(Expr):
    -    def __init__(self,value):
    -        self.type = "number"
    -        self.value = value
    -
    -def p_expression_binop(p):
    -    '''expression : expression PLUS expression
    -                  | expression MINUS expression
    -                  | expression TIMES expression
    -                  | expression DIVIDE expression'''
    -
    -    p[0] = BinOp(p[1],p[2],p[3])
    -
    -def p_expression_group(p):
    -    'expression : LPAREN expression RPAREN'
    -    p[0] = p[2]
    -
    -def p_expression_number(p):
    -    'expression : NUMBER'
    -    p[0] = Number(p[1])
    -
    -
    - -The advantage to this approach is that it may make it easier to attach more complicated -semantics, type checking, code generation, and other features to the node classes. - -
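For instance, one possible sketch adds an evaluate() method to the node classes defined above (the method and its behavior are illustrative, not part of PLY):

    class BinOp(Expr):
        # __init__ as shown above
        def evaluate(self):
            # Evaluate both children, then apply the operator
            lval = self.left.evaluate()
            rval = self.right.evaluate()
            if self.op == '+':
                return lval + rval
            elif self.op == '-':
                return lval - rval
            elif self.op == '*':
                return lval * rval
            elif self.op == '/':
                return lval / rval

    class Number(Expr):
        # __init__ as shown above
        def evaluate(self):
            return self.value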

    -To simplify tree traversal, it may make sense to pick a very generic -tree structure for your parse tree nodes. For example: - -

    -
    -class Node:
    -    def __init__(self,type,children=None,leaf=None):
    -         self.type = type
    -         if children:
    -              self.children = children
    -         else:
    -              self.children = [ ]
    -         self.leaf = leaf
    -	 
    -def p_expression_binop(p):
    -    '''expression : expression PLUS expression
    -                  | expression MINUS expression
    -                  | expression TIMES expression
    -                  | expression DIVIDE expression'''
    -
    -    p[0] = Node("binop", [p[1],p[3]], p[2])
    -
    -
    - -
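With a uniform node structure like this, tree walks can be written once. A hedged sketch of a simple printer, assuming every child is itself a Node:

    def print_tree(node, indent=0):
        # Show the node type and leaf value, then recurse into the children
        print(' ' * indent + node.type, node.leaf if node.leaf is not None else '')
        for child in node.children:
            print_tree(child, indent + 2)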

    6.11 Embedded Actions

    - - -The parsing technique used by yacc only allows actions to be executed at the end of a rule. For example, -suppose you have a rule like this: - -
    -
    -def p_foo(p):
    -    "foo : A B C D"
    -    print("Parsed a foo", p[1],p[2],p[3],p[4])
    -
    -
    - -

    -In this case, the supplied action code only executes after all of the -symbols A, B, C, and D have been -parsed. Sometimes, however, it is useful to execute small code -fragments during intermediate stages of parsing. For example, suppose -you wanted to perform some action immediately after A has -been parsed. To do this, write an empty rule like this: - -

    -
    -def p_foo(p):
    -    "foo : A seen_A B C D"
    -    print("Parsed a foo", p[1],p[3],p[4],p[5])
    -    print("seen_A returned", p[2])
    -
    -def p_seen_A(p):
    -    "seen_A :"
    -    print("Saw an A = ", p[-1])   # Access grammar symbol to left
    -    p[0] = some_value            # Assign value to seen_A
    -
    -
    -
    - -

    -In this example, the empty seen_A rule executes immediately -after A is shifted onto the parsing stack. Within this -rule, p[-1] refers to the symbol on the stack that appears -immediately to the left of the seen_A symbol. In this case, -it would be the value of A in the foo rule -immediately above. Like other rules, a value can be returned from an -embedded action by simply assigning it to p[0] - -

    -The use of embedded actions can sometimes introduce extra shift/reduce conflicts. For example, -this grammar has no conflicts: - -

    -
    -def p_foo(p):
    -    """foo : abcd
    -           | abcx"""
    -
    -def p_abcd(p):
    -    "abcd : A B C D"
    -
    -def p_abcx(p):
    -    "abcx : A B C X"
    -
    -
    - -However, if you insert an embedded action into one of the rules like this, - -
    -
    -def p_foo(p):
    -    """foo : abcd
    -           | abcx"""
    -
    -def p_abcd(p):
    -    "abcd : A B C D"
    -
    -def p_abcx(p):
    -    "abcx : A B seen_AB C X"
    -
    -def p_seen_AB(p):
    -    "seen_AB :"
    -
    -
    - -an extra shift-reduce conflict will be introduced. This conflict is -caused by the fact that the same symbol C appears next in -both the abcd and abcx rules. The parser can either -shift the symbol (abcd rule) or reduce the empty -rule seen_AB (abcx rule). - -

    -A common use of embedded rules is to control other aspects of parsing -such as scoping of local variables. For example, if you were parsing C code, you might -write code like this: - -

    -
    -def p_statements_block(p):
    -    "statements: LBRACE new_scope statements RBRACE"""
    -    # Action code
    -    ...
    -    pop_scope()        # Return to previous scope
    -
    -def p_new_scope(p):
    -    "new_scope :"
    -    # Create a new scope for local variables
    -    s = new_scope()
    -    push_scope(s)
    -    ...
    -
    -
    -In this case, the embedded action new_scope executes immediately after a LBRACE ({) symbol is parsed. This might adjust internal symbol tables and other aspects of the parser. Upon completion of the statements rule, code might undo the operations performed in the embedded action (e.g., pop_scope()).

    6.12 Miscellaneous Yacc Notes

    - - - -

    - - -

    7. Multiple Parsers and Lexers

    - - -In advanced parsing applications, you may want to have multiple -parsers and lexers. - -

    -As a general rule, this isn't a problem. However, to make it work, you need to carefully make sure everything gets hooked up correctly. First, make sure you save the objects returned by lex() and yacc(). For example:

    -
    -lexer  = lex.lex()       # Return lexer object
    -parser = yacc.yacc()     # Return parser object
    -
    -
    - -Next, when parsing, make sure you give the parse() function a reference to the lexer it -should be using. For example: - -
    -
    -parser.parse(text,lexer=lexer)
    -
    -
    - -If you forget to do this, the parser will use the last lexer -created--which is not always what you want. - -

    -Within lexer and parser rule functions, these objects are also -available. In the lexer, the "lexer" attribute of a token refers to -the lexer object that triggered the rule. For example: - -

    -
    -def t_NUMBER(t):
    -   r'\d+'
    -   ...
    -   print(t.lexer)           # Show lexer object
    -
    -
    - -In the parser, the "lexer" and "parser" attributes refer to the lexer -and parser objects respectively. - -
    -
    -def p_expr_plus(p):
    -   'expr : expr PLUS expr'
    -   ...
    -   print(p.parser)          # Show parser object
    -   print(p.lexer)           # Show lexer object
    -
    -
    - -If necessary, arbitrary attributes can be attached to the lexer or parser object. -For example, if you wanted to have different parsing modes, you could attach a mode -attribute to the parser object and look at it later. - -
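A sketch of that idea (the mode attribute and the statement rule are purely illustrative):

    def p_statement_expr(p):
        'statement : expression'
        if p.parser.mode == 'strict':     # consult the attribute during parsing
            ...                           # e.g., apply extra checks here
        p[0] = p[1]

    parser = yacc.yacc()
    parser.mode = 'strict'                # attach an arbitrary attribute to the parser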

    8. Using Python's Optimized Mode

    - - -Because PLY uses information from doc-strings, parsing and lexing -information must be gathered while running the Python interpreter in -normal mode (i.e., not with the -O or -OO options). However, if you -specify optimized mode like this: - -
    -
    -lex.lex(optimize=1)
    -yacc.yacc(optimize=1)
    -
    -
    - -then PLY can later be used when Python runs in optimized mode. To make this work, -make sure you first run Python in normal mode. Once the lexing and parsing tables -have been generated the first time, run Python in optimized mode. PLY will use -the tables without the need for doc strings. - -

    -Beware: running PLY in optimized mode disables a lot of error -checking. You should only do this when your project has stabilized -and you don't need to do any debugging. One of the purposes of -optimized mode is to substantially decrease the startup time of -your compiler (by assuming that everything is already properly -specified and works). - -

    9. Advanced Debugging

    - - -

    -Debugging a compiler is typically not an easy task. PLY provides some advanced diagnostic capabilities through the use of Python's logging module. The next two sections describe this:

    9.1 Debugging the lex() and yacc() commands

    - - -

    -Both the lex() and yacc() commands have a debugging -mode that can be enabled using the debug flag. For example: - -

    -
    -lex.lex(debug=True)
    -yacc.yacc(debug=True)
    -
    -
    - -Normally, the output produced by debugging is routed to either -standard error or, in the case of yacc(), to a file -parser.out. This output can be more carefully controlled -by supplying a logging object. Here is an example that adds -information about where different debugging messages are coming from: - -
    -
    -# Set up a logging object
    -import logging
    -logging.basicConfig(
    -    level = logging.DEBUG,
    -    filename = "parselog.txt",
    -    filemode = "w",
    -    format = "%(filename)10s:%(lineno)4d:%(message)s"
    -)
    -log = logging.getLogger()
    -
    -lex.lex(debug=True,debuglog=log)
    -yacc.yacc(debug=True,debuglog=log)
    -
    -
    - -If you supply a custom logger, the amount of debugging -information produced can be controlled by setting the logging level. -Typically, debugging messages are either issued at the DEBUG, -INFO, or WARNING levels. - -
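For example, building on the basicConfig() setup shown above, raising the logger's level filters out the DEBUG-level messages while keeping INFO and WARNING output:

    log = logging.getLogger()
    log.setLevel(logging.INFO)            # drop DEBUG messages; keep INFO and above

    yacc.yacc(debug=True, debuglog=log)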

    -PLY's error messages and warnings are also produced using the logging -interface. This can be controlled by passing a logging object -using the errorlog parameter. - -

    -
    -lex.lex(errorlog=log)
    -yacc.yacc(errorlog=log)
    -
    -
    - -If you want to completely silence warnings, you can either pass in a -logging object with an appropriate filter level or use the NullLogger -object defined in either lex or yacc. For example: - -
    -
    -yacc.yacc(errorlog=yacc.NullLogger())
    -
    -
    - -

    9.2 Run-time Debugging

    - - -

    -To enable run-time debugging of a parser, use the debug option to parse. This -option can either be an integer (which simply turns debugging on or off) or an instance -of a logger object. For example: - -

    -
    -log = logging.getLogger()
    -parser.parse(input,debug=log)
    -
    -
    - -If a logging object is passed, you can use its filtering level to control how much -output gets generated. The INFO level is used to produce information -about rule reductions. The DEBUG level will show information about the -parsing stack, token shifts, and other details. The ERROR level shows information -related to parsing errors. - -

    -For very complicated problems, you should pass in a logging object that -redirects to a file where you can more easily inspect the output after -execution. - -
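For instance, a sketch of that setup (the file name and the parser/data variables are assumptions):

    import logging
    logging.basicConfig(
        level    = logging.DEBUG,
        filename = "parsedebug.txt",      # all run-time debug output lands here
        filemode = "w"
    )
    log = logging.getLogger()

    result = parser.parse(data, debug=log)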

    10. Packaging Advice

    - - -

    -If you are distributing a package that makes use of PLY, you should spend a few moments thinking about how you want to handle the files that are automatically generated--for example, the parsetab.py file generated by the yacc() function.

    - -

    -Starting in PLY-3.6, the table files are created in the same directory -as the file where a parser is defined. This means that the -parsetab.py file will live side-by-side with your parser -specification. In terms of packaging, this is probably the easiest and -most sane approach to manage. You don't need to give yacc() -any extra arguments and it should just "work."

    - -

    -One concern is the management of the parsetab.py file itself. -For example, should you have this file checked into version control (e.g., GitHub), -should it be included in a package distribution as a normal file, or should you -just let PLY generate it automatically for the user when they install your package? -

    - -

    -As of PLY-3.6, the parsetab.py file should be compatible across all versions -of Python including Python 2 and 3. Thus, a table file generated in Python 2 should -work fine if it's used on Python 3. Because of this, it should be relatively harmless -to distribute the parsetab.py file yourself if you need to. However, be aware -that older/newer versions of PLY may try to regenerate the file if there are future -enhancements or changes to its format. -

    - -

    -To make the generation of table files easier for the purposes of installation, you might want to make your parser files executable using the -m option or similar. For example:

    - -
    -
    -# calc.py
    -...
    -...
    -def make_parser():
    -    parser = yacc.yacc()
    -    return parser
    -
    -if __name__ == '__main__':
    -    make_parser()
    -
    -
    - -

    -You can then use a command such as python -m calc to generate the tables. Alternatively, a setup.py script can import the module and use make_parser() to create the parsing tables.
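For instance, a hypothetical build step in setup.py might be as simple as this (assuming calc.py is importable at build time):

    # setup.py (sketch)
    import calc
    calc.make_parser()       # building the parser writes the table file alongside calc.py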

    - -

    -If you're willing to sacrifice a little startup time, you can also instruct PLY to never write the -tables using yacc.yacc(write_tables=False, debug=False). In this mode, PLY will regenerate -the parsing tables from scratch each time. For a small grammar, you probably won't notice. For a -large grammar, you should probably reconsider--the parsing tables are meant to dramatically speed up this process. -

    - -

    -During operation, it is normal for PLY to produce diagnostic error -messages (usually printed to standard error). These are generated -entirely using the logging module. If you want to redirect -these messages or silence them, you can provide your own logging -object to yacc(). For example: -

    - -
    -
    -import logging
    -log = logging.getLogger('ply')
    -...
    -parser = yacc.yacc(errorlog=log)
    -
    -
    - -

    11. Where to go from here?

    - - -The examples directory of the PLY distribution contains several simple examples. Please consult a -compilers textbook for the theory and underlying implementation details or LR parsing. - - - - - - - - - - diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..91883c3 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,192 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." 
+ +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/sly.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/sly.qhc" + +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/sly" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/sly" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." 
+ +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..abfb05c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,284 @@ +# -*- coding: utf-8 -*- +# +# ply documentation build configuration file, created by +# sphinx-quickstart on Wed Sep 7 13:23:26 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import shlex + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'ply' +copyright = u'2001-2020, David Beazley' +author = u'David Beazley' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '4.0' +# The full version, including alpha/beta/rc tags. +release = '4.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. 
+#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. 
The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'plydoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'ply.tex', u'Ply Documentation', + u'David Beazley', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'ply', u'Ply Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'ply', u'Ply Documentation', + author, 'ply', 'Python Lex-Yacc.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..da22efd --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,58 @@ +PLY (Python Lex-Yacc) +===================== + +Requirements +------------ + +PLY requires the use of Python 3.6 or greater. Older versions +of Python are not supported. 
+ +Overview +-------- + +PLY is a 100% Python implementation of the lex and yacc tools +commonly used to write parsers and compilers. Parsing is +based on the same LALR(1) algorithm used by many yacc tools. +Here are a few notable features: + + - PLY provides *very* extensive error reporting and diagnostic + information to assist in parser construction. The original + implementation was developed for instructional purposes. As + a result, the system tries to identify the most common types + of errors made by novice users. + + - PLY provides full support for empty productions, error recovery, + precedence specifiers, and moderately ambiguous grammars. + + - PLY can be used to build parsers for "real" programming languages. + Although it is not ultra-fast due to its Python implementation, + PLY can be used to parse grammars consisting of several hundred + rules (as might be found for a language like C). + +More Documentation +================== + +Contents: + +.. toctree:: + :maxdepth: 3 + + ply + internals + +Resources +========= + +For a detailed overview of parsing theory, consult the excellent +book "Compilers : Principles, Techniques, and Tools" by Aho, Sethi, and +Ullman. The topics found in "Lex & Yacc" by Levine, Mason, and Brown +may also be useful. + +The GitHub page for PLY can be found at: + + https://github.com/dabeaz/ply + +Please direct bug reports and pull requests to the GitHub page. +To contact me directly, send email to dave@dabeaz.com or contact +me on Twitter (@dabeaz). + diff --git a/docs/internals.rst b/docs/internals.rst new file mode 100644 index 0000000..2e0de17 --- /dev/null +++ b/docs/internals.rst @@ -0,0 +1,530 @@ +PLY Internals +============= + +1. Introduction +--------------- + +This document describes classes and functions that make up the internal +operation of PLY. Using this programming interface, it is possible to +manually build an parser using a different interface specification +than what PLY normally uses. For example, you could build a gramar +from information parsed in a completely different input format. Some of +these objects may be useful for building more advanced parsing engines +such as GLR. + +It should be stressed that using PLY at this level is not for the +faint of heart. Generally, it's assumed that you know a bit of +the underlying compiler theory and how an LR parser is put together. + +2. Grammar Class +---------------- + +The file ``ply.yacc`` defines a class ``Grammar`` that +is used to hold and manipulate information about a grammar +specification. It encapsulates the same basic information +about a grammar that is put into a YACC file including +the list of tokens, precedence rules, and grammar rules. +Various operations are provided to perform different validations +on the grammar. In addition, there are operations to compute +the first and follow sets that are needed by the various table +generation algorithms. + + +``Grammar(terminals)`` + Creates a new grammar object. ``terminals`` is a list of strings + specifying the terminals for the grammar. An instance ``g`` of + ``Grammar`` has the following methods: + + +``g.set_precedence(term,assoc,level)`` + Sets the precedence level and associativity for a given terminal ``term``. + ``assoc`` is one of ``'right'``, + ``'left'``, or ``'nonassoc'`` and ``level`` is a positive integer. The higher + the value of ``level``, the higher the precedence. 
Here is an example of typical + precedence settings:: + + g.set_precedence('PLUS', 'left',1) + g.set_precedence('MINUS', 'left',1) + g.set_precedence('TIMES', 'left',2) + g.set_precedence('DIVIDE','left',2) + g.set_precedence('UMINUS','left',3) + + This method must be called prior to adding any productions to the + grammar with ``g.add_production()``. The precedence of individual grammar + rules is determined by the precedence of the right-most terminal. + + +``g.add_production(name,syms,func=None,file='',line=0)`` + Adds a new grammar rule. ``name`` is the name of the rule, + ``syms`` is a list of symbols making up the right hand + side of the rule, ``func`` is the function to call when + reducing the rule. ``file`` and ``line`` specify + the filename and line number of the rule and are used for + generating error messages. + + The list of symbols in ``syms`` may include character + literals and ``%prec`` specifiers. Here are some + examples:: + + g.add_production('expr',['expr','PLUS','term'],func,file,line) + g.add_production('expr',['expr','"+"','term'],func,file,line) + g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line) + + If any kind of error is detected, a ``GrammarError`` exception + is raised with a message indicating the reason for the failure. + + +``g.set_start(start=None)`` + Sets the starting rule for the grammar. ``start`` is a string + specifying the name of the start rule. If ``start`` is omitted, + the first grammar rule added with ``add_production()`` is taken to be + the starting rule. This method must always be called after all + productions have been added. + +``g.find_unreachable()`` + Diagnostic function. Returns a list of all unreachable non-terminals + defined in the grammar. This is used to identify inactive parts of + the grammar specification. + +``g.infinite_cycle()`` + Diagnostic function. Returns a list of all non-terminals in the + grammar that result in an infinite cycle. This condition occurs if + there is no way for a grammar rule to expand to a string containing + only terminal symbols. + +``g.undefined_symbols()`` + Diagnostic function. Returns a list of tuples ``(name, prod)`` + corresponding to undefined symbols in the grammar. ``name`` is the + name of the undefined symbol and ``prod`` is an instance of + ``Production`` which has information about the production rule + where the undefined symbol was used. + +``g.unused_terminals()`` + Diagnostic function. Returns a list of terminals that were defined, + but never used in the grammar. + +``g.unused_rules()`` + Diagnostic function. Returns a list of ``Production`` instances + corresponding to production rules that were defined in the grammar, + but never used anywhere. This is slightly different + than ``find_unreachable()``. + +``g.unused_precedence()`` + Diagnostic function. Returns a list of tuples ``(term, assoc)`` + corresponding to precedence rules that were set, but never used the + grammar. ``term`` is the terminal name and ``assoc`` is the + precedence associativity (e.g., ``'left'``, ``'right'``, + or ``'nonassoc'``. + +``g.compute_first()`` + Compute all of the first sets for all symbols in the grammar. Returns a dictionary + mapping symbol names to a list of all first symbols. + +``g.compute_follow()`` + Compute all of the follow sets for all non-terminals in the grammar. + The follow set is the set of all possible symbols that might follow a + given non-terminal. Returns a dictionary mapping non-terminal names + to a list of symbols. 
+ +``g.build_lritems()`` + Calculates all of the LR items for all productions in the grammar. This + step is required before using the grammar for any kind of table generation. + See the section on LR items below. + +The following attributes are set by the above methods and may be useful +in code that works with the grammar. All of these attributes should be +assumed to be read-only. Changing their values directly will likely +break the grammar. + +``g.Productions`` + A list of all productions added. The first entry is reserved for + a production representing the starting rule. The objects in this list + are instances of the ``Production`` class, described shortly. + +``g.Prodnames`` + A dictionary mapping the names of nonterminals to a list of all + productions of that nonterminal. + +``g.Terminals`` + A dictionary mapping the names of terminals to a list of the + production numbers where they are used. + +``g.Nonterminals`` + A dictionary mapping the names of nonterminals to a list of the + production numbers where they are used. + +``g.First`` + A dictionary representing the first sets for all grammar symbols. This is + computed and returned by the ``compute_first()`` method. + +``g.Follow`` + A dictionary representing the follow sets for all grammar rules. This is + computed and returned by the ``compute_follow()`` method. + +``g.Start`` + Starting symbol for the grammar. Set by the ``set_start()`` method. + +For the purposes of debugging, a ``Grammar`` object supports the ``__len__()`` and +``__getitem__()`` special methods. Accessing ``g[n]`` returns the nth production +from the grammar. + +3. Productions +-------------- + +``Grammar`` objects store grammar rules as instances of a ``Production`` class. This +class has no public constructor--you should only create productions by calling ``Grammar.add_production()``. +The following attributes are available on a ``Production`` instance ``p``. + +``p.name`` + The name of the production. For a grammar rule such as ``A : B C D``, this is ``'A'``. + +``p.prod`` + A tuple of symbols making up the right-hand side of the production. For a grammar rule such as ``A : B C D``, this is ``('B','C','D')``. + +``p.number`` + Production number. An integer containing the index of the production in the grammar's ``Productions`` list. + +``p.func`` + The name of the reduction function associated with the production. + This is the function that will execute when reducing the entire + grammar rule during parsing. + +``p.callable`` + The callable object associated with the name in ``p.func``. This is ``None`` + unless the production has been bound using ``bind()``. + +``p.file`` + Filename associated with the production. Typically this is the file where the production was defined. Used for error messages. + +``p.lineno`` + Line number associated with the production. Typically this is the line number in ``p.file`` where the production was defined. Used for error messages. + +``p.prec`` + Precedence and associativity associated with the production. This is a tuple ``(assoc,level)`` where + ``assoc`` is one of ``'left'``,``'right'``, or ``'nonassoc'`` and ``level`` is + an integer. This value is determined by the precedence of the right-most terminal symbol in the production + or by use of the ``%prec`` specifier when adding the production. + +``p.usyms`` + A list of all unique symbols found in the production. + +``p.lr_items`` + A list of all LR items for this production. 
This attribute only has a meaningful value if the + ``Grammar.build_lritems()`` method has been called. The items in this list are + instances of ``LRItem`` described below. + +``p.lr_next`` + The head of a linked-list representation of the LR items in ``p.lr_items``. + This attribute only has a meaningful value if the ``Grammar.build_lritems()`` + method has been called. Each ``LRItem`` instance has a ``lr_next`` attribute + to move to the next item. The list is terminated by ``None``. + +``p.bind(dict)`` + Binds the production function name in ``p.func`` to a callable object in + ``dict``. This operation is typically carried out in the last step + prior to running the parsing engine and is needed since parsing tables are typically + read from files which only include the function names, not the functions themselves. + +``Production`` objects support +the ``__len__()``, ``__getitem__()``, and ``__str__()`` +special methods. +``len(p)`` returns the number of symbols in ``p.prod`` +and ``p[n]`` is the same as ``p.prod[n]``. + +4. LRItems +---------- + +The construction of parsing tables in an LR-based parser generator is primarily +done over a set of "LR Items". An LR item represents a stage of parsing one +of the grammar rules. To compute the LR items, it is first necessary to +call ``Grammar.build_lritems()``. Once this step, all of the productions +in the grammar will have their LR items attached to them. + +Here is an interactive example that shows what LR items look like if you +interactively experiment. In this example, ``g`` is a ``Grammar`` +object:: + + >>> g.build_lritems() + >>> p = g[1] + >>> p + Production(statement -> ID = expr) + >>> + +In the above code, ``p`` represents the first grammar rule. In +this case, a rule ``'statement -> ID = expr'``. + +Now, let's look at the LR items for ``p``:: + + >>> p.lr_items + [LRItem(statement -> . ID = expr), + LRItem(statement -> ID . = expr), + LRItem(statement -> ID = . expr), + LRItem(statement -> ID = expr .)] + >>> + +In each LR item, the dot (.) represents a specific stage of parsing. In each LR item, the dot +is advanced by one symbol. It is only when the dot reaches the very end that a production +is successfully parsed. + +An instance ``lr`` of ``LRItem`` has the following +attributes that hold information related to that specific stage of +parsing. + +``lr.name`` + The name of the grammar rule. For example, ``'statement'`` in the above example. + +``lr.prod`` + A tuple of symbols representing the right-hand side of the production, including the + special ``'.'`` character. For example, ``('ID','.','=','expr')``. + +``lr.number`` + An integer representing the production number in the grammar. + +``lr.usyms`` + A set of unique symbols in the production. Inherited from the original ``Production`` instance. + +``lr.lr_index`` + An integer representing the position of the dot (.). You should never use ``lr.prod.index()`` + to search for it--the result will be wrong if the grammar happens to also use (.) as a character + literal. + +``lr.lr_after`` + A list of all productions that can legally appear immediately to the right of the + dot (.). This list contains ``Production`` instances. This attribute + represents all of the possible branches a parse can take from the current position. + For example, suppose that ``lr`` represents a stage immediately before + an expression like this:: + + >>> lr + LRItem(statement -> ID = . 
expr) + >>> + + Then, the value of ``lr.lr_after`` might look like this, showing all productions that + can legally appear next:: + + >>> lr.lr_after + [Production(expr -> expr PLUS expr), + Production(expr -> expr MINUS expr), + Production(expr -> expr TIMES expr), + Production(expr -> expr DIVIDE expr), + Production(expr -> MINUS expr), + Production(expr -> LPAREN expr RPAREN), + Production(expr -> NUMBER), + Production(expr -> ID)] + >>> + +``lr.lr_before`` + The grammar symbol that appears immediately before the dot (.) or ``None`` if + at the beginning of the parse. + +``lr.lr_next`` + A link to the next LR item, representing the next stage of the parse. ``None`` if ``lr`` + is the last LR item. + +``LRItem`` instances also support the ``__len__()`` and ``__getitem__()`` special methods. +``len(lr)`` returns the number of items in ``lr.prod`` including the dot (.). ``lr[n]`` +returns ``lr.prod[n]``. + +It goes without saying that all of the attributes associated with LR +items should be assumed to be read-only. Modifications will very +likely create a small black-hole that will consume you and your code. + +5. LRTable +---------- + +The ``LRTable`` class represents constructed LR parsing tables on a +grammar. + +``LRTable(grammar, log=None)`` + Create the LR parsing tables on a grammar. ``grammar`` is an instance of ``Grammar`` and + ``log`` is a logger object used to write debugging information. The debugging information + written to ``log`` is the same as what appears in the ``parser.out`` file created + by yacc. By supplying a custom logger with a different message format, it is possible to get + more information (e.g., the line number in ``yacc.py`` used for issuing each line of + output in the log). + +An instance ``lr`` of ``LRTable`` has the following attributes. + +``lr.grammar`` + A link to the Grammar object used to construct the parsing tables. + +``lr.lr_method`` + The LR parsing method used (e.g., ``'LALR'``) + +``lr.lr_productions`` + A reference to ``grammar.Productions``. This, together with ``lr_action`` and ``lr_goto`` + contain all of the information needed by the LR parsing engine. + +``lr.lr_action`` + The LR action dictionary that implements the underlying state machine. The keys of this dictionary are + the LR states. + +``lr.lr_goto`` + The LR goto table that contains information about grammar rule reductions. + +``lr.sr_conflicts`` + A list of tuples ``(state,token,resolution)`` identifying all shift/reduce conflicts. ``state`` is the LR state + number where the conflict occurred, ``token`` is the token causing the conflict, and ``resolution`` is + a string describing the resolution taken. ``resolution`` is either ``'shift'`` or ``'reduce'``. + +``lr.rr_conflicts`` + A list of tuples ``(state,rule,rejected)`` identifying all reduce/reduce conflicts. ``state`` is the + LR state number where the conflict occurred, ``rule`` is the production rule that was selected + and ``rejected`` is the production rule that was rejected. Both ``rule`` and ``rejected`` are + instances of ``Production``. They can be inspected to provide the user with more information. + +``lrtab.bind_callables(dict)`` + This binds all of the function names used in productions to callable objects + found in the dictionary ``dict``. During table generation and when reading + LR tables from files, PLY only uses the names of action functions such as ``'p_expr'``, + ``'p_statement'``, etc. In order to actually run the parser, these names + have to be bound to callable objects. 
This method is always called prior to
+    running a parser.
+
+6. LRParser
+-----------
+
+The ``LRParser`` class implements the low-level LR parsing engine.
+
+``LRParser(lrtab, error_func)``
+    Create an LRParser.  ``lrtab`` is an instance of ``LRTable``
+    containing the LR production and state tables.  ``error_func`` is the
+    error function to invoke in the event of a parsing error.
+
+An instance ``p`` of ``LRParser`` has the following methods:
+
+``p.parse(input=None,lexer=None,debug=0,tracking=0)``
+    Run the parser.  ``input`` is a string, which if supplied is fed into the
+    lexer using its ``input()`` method.  ``lexer`` is an instance of the
+    ``Lexer`` class to use for tokenizing.  If not supplied, the last lexer
+    created with the ``lex`` module is used.  ``debug`` is a boolean flag
+    that enables debugging.  ``tracking`` is a boolean flag that tells the
+    parser to perform additional line number tracking.
+
+``p.restart()``
+    Resets the parser state for a parse already in progress.
+
+7. ParserReflect
+----------------
+
+The ``ParserReflect`` class is used to collect parser specification data
+from a Python module or object.  This class is what collects all of the
+``p_rule()`` functions in a PLY file, performs basic error checking,
+and collects all of the needed information to build a grammar.  Most of the
+high-level PLY interface as used by the ``yacc()`` function is actually
+implemented by this class.
+
+``ParserReflect(pdict, log=None)``
+    Creates a ``ParserReflect`` instance.  ``pdict`` is a dictionary
+    containing parser specification data.  This dictionary typically corresponds
+    to the module or class dictionary of code that implements a PLY parser.
+    ``log`` is a logger instance that will be used to report error
+    messages.
+
+An instance ``p`` of ``ParserReflect`` has the following methods:
+
+``p.get_all()``
+    Collect and store all required parsing information.
+
+``p.validate_all()``
+    Validate all of the collected parsing information.  This is a separate step
+    from ``p.get_all()`` as a performance optimization.  In order to
+    reduce parser start-up time, a parser can elect to only validate the
+    parsing data when regenerating the parsing tables.  The validation
+    step tries to collect as much information as possible rather than
+    raising an exception at the first sign of trouble.  The attribute
+    ``p.error`` is set if there are any validation errors.  The
+    value of this attribute is also returned.
+
+``p.signature()``
+    Compute a signature representing the contents of the collected parsing
+    data.  The signature value should change if anything in the parser
+    specification has changed in a way that would justify parser table
+    regeneration.  This method can be called after ``p.get_all()``,
+    but before ``p.validate_all()``.
+
+The following attributes are set in the process of collecting data:
+
+``p.start``
+    The grammar start symbol, if any.  Taken from ``pdict['start']``.
+
+``p.error_func``
+    The error handling function or ``None``.  Taken from ``pdict['p_error']``.
+
+``p.tokens``
+    The token list.  Taken from ``pdict['tokens']``.
+
+``p.prec``
+    The precedence specifier.  Taken from ``pdict['precedence']``.
+
+``p.preclist``
+    A parsed version of the precedence specifier.  A list of tuples of the form
+    ``(token,assoc,level)`` where ``token`` is the terminal symbol,
+    ``assoc`` is the associativity (e.g., ``'left'``) and ``level``
+    is a numeric precedence level.
+
+``p.grammar``
+    A list of tuples ``(name, rules)`` representing the grammar rules.
``name`` is the + name of a Python function or method in ``pdict`` that starts with ``"p_"``. + ``rules`` is a list of tuples ``(filename,line,prodname,syms)`` representing + the grammar rules found in the documentation string of that function. ``filename`` and ``line`` contain location + information that can be used for debugging. ``prodname`` is the name of the + production. ``syms`` is the right-hand side of the production. If you have a + function like this:: + + def p_expr(p): + '''expr : expr PLUS expr + | expr MINUS expr + | expr TIMES expr + | expr DIVIDE expr''' + + then the corresponding entry in ``p.grammar`` might look like this:: + + ('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']), + ('calc.py',11,'expr', ['expr','MINUS','expr']), + ('calc.py',12,'expr', ['expr','TIMES','expr']), + ('calc.py',13,'expr', ['expr','DIVIDE','expr']) + ]) + +``p.pfuncs`` + A sorted list of tuples ``(line, file, name, doc)`` representing all of + the ``p_`` functions found. ``line`` and ``file`` give location + information. ``name`` is the name of the function. ``doc`` is the + documentation string. This list is sorted in ascending order by line number. + +``p.files`` + A dictionary holding all of the source filenames that were encountered + while collecting parser information. Only the keys of this dictionary have + any meaning. + +``p.error`` + An attribute that indicates whether or not any critical errors + occurred in validation. If this is set, it means that that some kind + of problem was detected and that no further processing should be + performed. + +8. High-level operation +----------------------- + +Using all of the above classes requires some attention to detail. The ``yacc()`` +function carries out a very specific sequence of operations to create a grammar. +This same sequence should be emulated if you build an alternative PLY interface. + + +1. A ``ParserReflect`` object is created and raw grammar specification data is +collected. + +2. A ``Grammar`` object is created and populated with information +from the specification data. + +3. A ``LRTable`` object is created to run the LALR algorithm over +the ``Grammar`` object. + +4. Productions in the LRTable and bound to callables using the ``bind_callables()`` +method. + +5. A ``LRParser`` object is created from from the information in the +``LRTable`` object. + + + diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..474c9bd --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. 
man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 2> nul +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\sly.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\sly.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
+ goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/docs/ply.rst b/docs/ply.rst new file mode 100644 index 0000000..2f6a89a --- /dev/null +++ b/docs/ply.rst @@ -0,0 +1,2656 @@ +PLY (Python Lex-Yacc) +===================== + +This document provides an overview of lexing and parsing with PLY. +Given the intrinsic complexity of parsing, I strongly advise +that you read (or at least skim) this entire document before jumping +into a big development project with PLY. + +PLY-4.0 requires Python 3.6 or newer. If you're using an older version +of Python, you're out of luck. Sorry. + +Introduction +------------ + +PLY is a pure-Python implementation of the compiler +construction tools lex and yacc. The main goal of PLY is to stay +fairly faithful to the way in which traditional lex/yacc tools work. +This includes supporting LALR(1) parsing as well as providing +extensive input validation, error reporting, and diagnostics. 
Thus, +if you've used yacc in another programming language, it should be +relatively straightforward to use PLY. + +Early versions of PLY were developed to support an Introduction to +Compilers Course I taught in 2001 at the University of Chicago. Since +PLY was primarily developed as an instructional tool, you will find it +to be fairly picky about token and grammar rule specification. In +part, this added formality is meant to catch common programming +mistakes made by novice users. However, advanced users will also find +such features to be useful when building complicated grammars for real +programming languages. It should also be noted that PLY does not +provide much in the way of bells and whistles (e.g., automatic +construction of abstract syntax trees, tree traversal, etc.). Nor +would I consider it to be a parsing framework. Instead, you will find +a bare-bones, yet fully capable lex/yacc implementation written +entirely in Python. + +The rest of this document assumes that you are somewhat familiar with +parsing theory, syntax directed translation, and the use of compiler +construction tools such as lex and yacc in other programming +languages. If you are unfamiliar with these topics, you will probably +want to consult an introductory text such as "Compilers: Principles, +Techniques, and Tools", by Aho, Sethi, and Ullman. O'Reilly's "Lex +and Yacc" by John Levine may also be handy. In fact, the O'Reilly +book can be used as a reference for PLY as the concepts are virtually +identical. + +PLY Overview +------------ + +PLY consists of two separate modules; ``lex.py`` and ``yacc.py``, both +of which are found in a Python package called ``ply``. The ``lex.py`` +module is used to break input text into a collection of tokens +specified by a collection of regular expression rules. ``yacc.py`` is +used to recognize language syntax that has been specified in the form +of a context free grammar. + +The two tools are meant to work together. Specifically, ``lex.py`` +provides an interface to produce tokens. ``yacc.py`` uses this +retrieve tokens and invoke grammar rules. The output of ``yacc.py`` +is often an Abstract Syntax Tree (AST). However, this is entirely up +to the user. If desired, ``yacc.py`` can also be used to implement +simple one-pass compilers. + +Like its Unix counterpart, ``yacc.py`` provides most of the features +you expect including extensive error checking, grammar validation, +support for empty productions, error tokens, and ambiguity resolution +via precedence rules. In fact, almost everything that is possible in +traditional yacc should be supported in PLY. + +The primary difference between ``yacc.py`` and Unix ``yacc`` is that +``yacc.py`` doesn't involve a separate code-generation process. +Instead, PLY relies on reflection (introspection) to build its lexers +and parsers. Unlike traditional lex/yacc which require a special +input file that is converted into a separate source file, the +specifications given to PLY *are* valid Python programs. This +means that there are no extra source files nor is there a special +compiler construction step (e.g., running yacc to generate Python code +for the compiler). + +Lex +--- + +``lex.py`` is used to tokenize an input string. 
For example, suppose +you're writing a programming language and a user supplied the +following input string:: + + x = 3 + 42 * (s - t) + +A tokenizer splits the string into individual tokens:: + + 'x','=', '3', '+', '42', '*', '(', 's', '-', 't', ')' + +Tokens are usually given names to indicate what they are. For example:: + + 'ID','EQUALS','NUMBER','PLUS','NUMBER','TIMES', + 'LPAREN','ID','MINUS','ID','RPAREN' + +More specifically, the input is broken into pairs of token types and +values. For example:: + + ('ID','x'), ('EQUALS','='), ('NUMBER','3'), + ('PLUS','+'), ('NUMBER','42), ('TIMES','*'), + ('LPAREN','('), ('ID','s'), ('MINUS','-'), + ('ID','t'), ('RPAREN',')' + +The specification of tokens is done by writing a series of +regular expression rules. The next section shows how this is done +using ``lex.py``. + +Lex Example +^^^^^^^^^^^ + +The following example shows how ``lex.py`` is used to write a simple tokenizer:: + + # ------------------------------------------------------------ + # calclex.py + # + # tokenizer for a simple expression evaluator for + # numbers and +,-,*,/ + # ------------------------------------------------------------ + import ply.lex as lex + + # List of token names. This is always required + tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', + ) + + # Regular expression rules for simple tokens + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + + # A regular expression rule with some action code + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + # Define a rule so we can track line numbers + def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + + # A string containing ignored characters (spaces and tabs) + t_ignore = ' \t' + + # Error handling rule + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer + lexer = lex.lex() + +To use the lexer, you first need to feed it some input text using +its ``input()`` method. After that, repeated calls +to ``token()`` produce tokens. The following code shows how this +works:: + + # Test it out + data = ''' + 3 + 4 * 10 + + -20 *2 + ''' + + # Give the lexer some input + lexer.input(data) + + # Tokenize + while True: + tok = lexer.token() + if not tok: + break # No more input + print(tok) + +When executed, the example will produce the following output:: + + $ python example.py + LexToken(NUMBER,3,2,1) + LexToken(PLUS,'+',2,3) + LexToken(NUMBER,4,2,5) + LexToken(TIMES,'*',2,7) + LexToken(NUMBER,10,2,10) + LexToken(PLUS,'+',3,14) + LexToken(MINUS,'-',3,16) + LexToken(NUMBER,20,3,18) + LexToken(TIMES,'*',3,20) + LexToken(NUMBER,2,3,21) + +Lexers also support the iteration protocol. So, you can write the +above loop as follows:: + + for tok in lexer: + print(tok) + +The tokens returned by ``lexer.token()`` are instances of +``LexToken``. This object has attributes ``type``, ``value``, +``lineno``, and ``lexpos``. The following code shows an +example of accessing these attributes:: + + # Tokenize + while True: + tok = lexer.token() + if not tok: + break # No more input + print(tok.type, tok.value, tok.lineno, tok.lexpos) + +The ``type`` and ``value`` attributes contain the type and +value of the token itself. ``lineno`` and ``lexpos`` contain +information about the location of the token. ``lexpos`` is the +index of the token relative to the start of the input text. 
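+
+Since lexers support iteration and expose these attributes, it's easy to
+write small helper functions for testing and debugging.  Here is a minimal
+sketch (the ``tokenize`` helper is only an illustration, not part of PLY,
+and assumes a lexer built from the ``calclex.py`` rules shown above)::
+
+    def tokenize(lexer, data):
+        # Feed the text to the lexer and collect (type, value) pairs
+        lexer.input(data)
+        return [(tok.type, tok.value) for tok in lexer]
+
+    # tokenize(lexer, "3 + 4") should produce something like
+    # [('NUMBER', 3), ('PLUS', '+'), ('NUMBER', 4)]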
+ +The tokens list +^^^^^^^^^^^^^^^ + +All lexers must provide a list ``tokens`` that defines all of the +possible token names that can be produced by the lexer. This list is +always required and is used to perform a variety of validation checks. +The tokens list is also used by the ``yacc.py`` module to identify +terminals. + +In the example, the following code specified the token names:: + + tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', + ) + +Specification of tokens +^^^^^^^^^^^^^^^^^^^^^^^ + +Each token is specified by writing a regular expression rule +compatible with Python's ``re`` module. Each of these rules are +defined by making declarations with a special prefix ``t_`` to +indicate that it defines a token. For simple tokens, the regular +expression can be specified as strings such as this (note: Python raw +strings are used since they are the most convenient way to write +regular expression strings):: + + t_PLUS = r'\+' + +In this case, the name following the ``t_`` must exactly match one of +the names supplied in ``tokens``. If some kind of action needs to be +performed, a token rule can be specified as a function. For example, +this rule matches numbers and converts the string into a Python +integer:: + + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + +When a function is used, the regular expression rule is specified in +the function documentation string. The function always takes a single +argument which is an instance of ``LexToken``. This object has +attributes of ``type`` which is the token type (as a string), +``value`` which is the lexeme (the actual text matched), +``lineno`` which is the current line number, and ``lexpos`` which +is the position of the token relative to the beginning of the input +text. By default, ``type`` is set to the name following the ``t_`` +prefix. The action function can modify the contents of the +``LexToken`` object as appropriate. However, when it is done, the +resulting token should be returned. If no value is returned by the +action function, the token is discarded and the next token +read. + +Internally, ``lex.py`` uses the ``re`` module to do its pattern +matching. Patterns are compiled using the ``re.VERBOSE`` flag which +can be used to help readability. However, be aware that unescaped +whitespace is ignored and comments are allowed in this mode. If your +pattern involves whitespace, make sure you use ``\s``. If you need to +match the ``#`` character, use ``[#]``. + +When building the master regular expression, rules are added in the +following order: + +1. All tokens defined by functions are added in the same order as they + appear in the lexer file. + +2. Tokens defined by strings are added next by sorting them in order + of decreasing regular expression length (longer expressions are added + first). + +Without this ordering, it can be difficult to correctly match certain +types of tokens. For example, if you wanted to have separate tokens +for "=" and "==", you need to make sure that "==" is checked first. +By sorting regular expressions in order of decreasing length, this +problem is solved for rules defined as strings. For functions, the +order can be explicitly controlled since rules appearing first are +checked first. + +To handle reserved words, you should write a single rule to match an +identifier and do a special name lookup in a function like this:: + + reserved = { + 'if' : 'IF', + 'then' : 'THEN', + 'else' : 'ELSE', + 'while' : 'WHILE', + ... 
+ } + + tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values()) + + def t_ID(t): + r'[a-zA-Z_][a-zA-Z_0-9]*' + t.type = reserved.get(t.value,'ID') # Check for reserved words + return t + +This approach greatly reduces the number of regular expression rules +and is likely to make things a little faster. + +Note: You should avoid writing individual rules for reserved words. +For example, if you write rules like this:: + + t_FOR = r'for' + t_PRINT = r'print' + +those rules will be triggered for identifiers that include those words +as a prefix such as "forget" or "printed". This is probably not what +you want. + +Token values +^^^^^^^^^^^^ + +When tokens are returned by lex, they have a value that is stored in +the ``value`` attribute. Normally, the value is the text that was +matched. However, the value can be assigned to any Python object. +For instance, when lexing identifiers, you may want to return both the +identifier name and information from some sort of symbol table. To do +this, you might write a rule like this:: + + def t_ID(t): + ... + # Look up symbol table information and return a tuple + t.value = (t.value, symbol_lookup(t.value)) + ... + return t + +It is important to note that storing data in other attribute names is +*not* recommended. The ``yacc.py`` module only exposes the +contents of the ``value`` attribute. Thus, accessing other attributes +may be unnecessarily awkward. If you need to store multiple values on +a token, assign a tuple, dictionary, or instance to ``value``. + +Discarded tokens +^^^^^^^^^^^^^^^^ + +To discard a token, such as a comment, define a token rule that +returns no value. For example:: + + def t_COMMENT(t): + r'\#.*' + pass + # No return value. Token discarded + +Alternatively, you can include the prefix ``ignore_`` in the token +declaration to force a token to be ignored. For example:: + + t_ignore_COMMENT = r'\#.*' + +Be advised that if you are ignoring many different kinds of text, you +may still want to use functions since these provide more precise +control over the order in which regular expressions are matched (i.e., +functions are matched in order of specification whereas strings are +sorted by regular expression length). + +Line numbers and positional information +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, ``lex.py`` knows nothing about line numbers. This is +because ``lex.py`` doesn't know anything about what constitutes a +"line" of input (e.g., the newline character or even if the input is +textual data). To update this information, you need to write a +special rule. In the example, the ``t_newline()`` rule shows how to +do this:: + + # Define a rule so we can track line numbers + def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + +Within the rule, the ``lineno`` attribute of the underlying lexer +``t.lexer`` is updated. After the line number is updated, the token +is discarded since nothing is returned. + +``lex.py`` does not perform any kind of automatic column tracking. +However, it does record positional information related to each token +in the ``lexpos`` attribute. Using this, it is usually possible to +compute column information as a separate step. For instance, just +count backwards until you reach a newline:: + + # Compute column. 
+    #     input is the input text string
+    #     token is a token instance
+    def find_column(input, token):
+        line_start = input.rfind('\n', 0, token.lexpos) + 1
+        return (token.lexpos - line_start) + 1
+
+Since column information is often only useful in the context of error
+handling, calculating the column position can be performed when needed
+as opposed to doing it for each token.  Note: If you're parsing a language
+where whitespace matters (e.g., Python), it's probably better to match
+whitespace as a token instead of ignoring it.
+
+Ignored characters
+^^^^^^^^^^^^^^^^^^
+
+The special ``t_ignore`` rule is reserved by ``lex.py`` for characters
+that should be completely ignored in the input stream.  Usually this
+is used to skip over whitespace and other non-essential characters.
+Although it is possible to define a regular expression rule for
+whitespace in a manner similar to ``t_newline()``, the use of
+``t_ignore`` provides substantially better lexing performance because
+it is handled as a special case and is checked in a much more
+efficient manner than the normal regular expression rules.
+
+The characters given in ``t_ignore`` are not ignored when such
+characters are part of other regular expression patterns.  For
+example, if you had a rule to capture quoted text, that pattern can
+include the ignored characters (which will be captured in the normal
+way).  The main purpose of ``t_ignore`` is to ignore whitespace and
+other padding between the tokens that you actually want to parse.
+
+Literal characters
+^^^^^^^^^^^^^^^^^^
+
+Literal characters can be specified by defining a variable
+``literals`` in your lexing module.  For example::
+
+    literals = [ '+','-','*','/' ]
+
+or alternatively::
+
+    literals = "+-*/"
+
+A literal character is a single character that is returned "as
+is" when encountered by the lexer.  Literals are checked after all of
+the defined regular expression rules.  Thus, if a rule starts with one
+of the literal characters, it will always take precedence.
+
+When a literal token is returned, both its ``type`` and ``value``
+attributes are set to the character itself.  For example, ``'+'``.
+
+It's possible to write token functions that perform additional actions
+when literals are matched.  However, you'll need to set the token type
+appropriately. For example::
+
+    literals = [ '{', '}' ]
+
+    def t_lbrace(t):
+        r'\{'
+        t.type = '{'      # Set token type to the expected literal
+        return t
+
+    def t_rbrace(t):
+        r'\}'
+        t.type = '}'      # Set token type to the expected literal
+        return t
+
+Error handling
+^^^^^^^^^^^^^^
+
+The ``t_error()`` function is used to handle lexing errors that occur
+when illegal characters are detected.  In this case, the ``t.value``
+attribute contains the rest of the input string that has not been
+tokenized.  In the example, the error function was defined as
+follows::
+
+    # Error handling rule
+    def t_error(t):
+        print("Illegal character '%s'" % t.value[0])
+        t.lexer.skip(1)
+
+In this case, we print the offending character and skip ahead
+one character by calling ``t.lexer.skip(1)``.
+
+EOF Handling
+^^^^^^^^^^^^
+
+The ``t_eof()`` function is used to handle an end-of-file (EOF)
+condition in the input.  As input, it receives a token type ``'eof'``
+with the ``lineno`` and ``lexpos`` attributes set appropriately.  The
+main use of this function is to provide more input to the lexer so that
+it can continue to parse.  Here is an example of how this works::
+
+    # EOF handling rule
+    def t_eof(t):
+        # Get more input (Example)
+        more = input('... ')
+        if more:
+            t.lexer.input(more)
+            return t.lexer.token()
+        return None
+
+The EOF function should return the next available token (by calling
+``t.lexer.token()``) or ``None`` to indicate no more data.  Be
+aware that setting more input with the ``t.lexer.input()`` method
+does NOT reset the lexer state or the ``lineno`` attribute used for
+position tracking.  The ``lexpos`` attribute is reset so be aware of
+that if you're using it in error reporting.
+
+Building and using the lexer
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To build the lexer, the function ``lex.lex()`` is used.  For example::
+
+    lexer = lex.lex()
+
+This function uses Python reflection (or introspection) to read the
+regular expression rules out of the calling context and build the
+lexer. Once the lexer has been built, two methods can be used to
+control the lexer.
+
+``lexer.input(data)``.  Reset the lexer and store a new input string.
+
+``lexer.token()``.  Return the next token.  Returns a special
+``LexToken`` instance on success or None if the end of the input text
+has been reached.
+
+The @TOKEN decorator
+^^^^^^^^^^^^^^^^^^^^
+
+In some applications, you may want to define tokens as a series of
+more complex regular expression rules.  For example::
+
+    digit            = r'([0-9])'
+    nondigit         = r'([_A-Za-z])'
+    identifier       = r'(' + nondigit + r'(' + digit + r'|' + nondigit + r')*)'
+
+    def t_ID(t):
+        # want docstring to be identifier above. ?????
+        ...
+
+In this case, we want the regular expression rule for ``ID`` to be one
+of the variables above. However, there is no way to directly specify
+this using a normal documentation string.  To solve this problem, you
+can use the ``@TOKEN`` decorator.  For example::
+
+    from ply.lex import TOKEN
+
+    @TOKEN(identifier)
+    def t_ID(t):
+        ...
+
+This will attach ``identifier`` to the docstring for ``t_ID()``
+allowing ``lex.py`` to work normally. Naturally, you could use ``@TOKEN``
+on all functions as an alternative to using doc-strings.
+
+Debugging
+^^^^^^^^^
+
+For the purpose of debugging, you can run ``lex()`` in a debugging
+mode as follows::
+
+    lexer = lex.lex(debug=True)
+
+This will produce various sorts of debugging information including all
+of the added rules, the master regular expressions used by the lexer,
+and tokens generated during lexing.
+
+In addition, ``lex.py`` comes with a simple main function which will
+either tokenize input read from standard input or from a file
+specified on the command line. To use it, put this in your
+lexer::
+
+    if __name__ == '__main__':
+         lex.runmain()
+
+Please refer to the "Debugging" section near the end for some more
+advanced details of debugging.
+
+Alternative specification of lexers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As shown in the example, lexers are specified all within one Python
+module.   If you want to put token rules in a different module from the
+one in which you invoke ``lex()``, use the ``module`` keyword
+argument.
+
+For example, you might have a dedicated module that just contains the
+token rules::
+
+    # module: tokrules.py
+    # This module just contains the lexing rules
+
+    # List of token names.
This is always required + tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', + ) + + # Regular expression rules for simple tokens + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + + # A regular expression rule with some action code + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + # Define a rule so we can track line numbers + def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + + # A string containing ignored characters (spaces and tabs) + t_ignore = ' \t' + + # Error handling rule + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +Now, if you wanted to build a tokenizer from these rules from within a +different module, you would do the following (shown for Python +interactive mode):: + + >>> import tokrules + >>> lexer = lex.lex(module=tokrules) + >>> lexer.input("3 + 4") + >>> lexer.token() + LexToken(NUMBER,3,1,1,0) + >>> lexer.token() + LexToken(PLUS,'+',1,2) + >>> lexer.token() + LexToken(NUMBER,4,1,4) + >>> lexer.token() + None + >>> + +The ``module`` option can also be used to define lexers from instances +of a class. For example:: + + import ply.lex as lex + + class MyLexer(object): + # List of token names. This is always required + tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', + ) + + # Regular expression rules for simple tokens + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + + # A regular expression rule with some action code + # Note addition of self parameter since we're in a class + def t_NUMBER(self,t): + r'\d+' + t.value = int(t.value) + return t + + # Define a rule so we can track line numbers + def t_newline(self,t): + r'\n+' + t.lexer.lineno += len(t.value) + + # A string containing ignored characters (spaces and tabs) + t_ignore = ' \t' + + # Error handling rule + def t_error(self,t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer + def build(self,**kwargs): + self.lexer = lex.lex(module=self, **kwargs) + + # Test it output + def test(self,data): + self.lexer.input(data) + while True: + tok = self.lexer.token() + if not tok: + break + print(tok) + + # Build the lexer and try it out + m = MyLexer() + m.build() # Build the lexer + m.test("3 + 4") # Test it + + +When building a lexer from class, *you should construct the lexer +from an instance of the class*, not the class object itself. This +is because PLY only works properly if the lexer actions are defined by +bound-methods. + +When using the ``module`` option to ``lex()``, PLY collects symbols +from the underlying object using the ``dir()`` function. There is no +direct access to the ``__dict__`` attribute of the object supplied as +a module value. + +Finally, if you want to keep things nicely encapsulated, but don't +want to use a full-fledged class definition, lexers can be defined +using closures. For example:: + + import ply.lex as lex + + # List of token names. 
This is always required + tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', + ) + + def MyLexer(): + # Regular expression rules for simple tokens + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + + # A regular expression rule with some action code + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + # Define a rule so we can track line numbers + def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + + # A string containing ignored characters (spaces and tabs) + t_ignore = ' \t' + + # Error handling rule + def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + + # Build the lexer from my environment and return it + return lex.lex() + +Important note: If you are defining a lexer using a class or closure, +be aware that PLY still requires you to only define a single lexer per +module (source file). There are extensive validation/error checking +parts of the PLY that may falsely report error messages if you don't +follow this rule. + +Maintaining state +^^^^^^^^^^^^^^^^^ + +In your lexer, you may want to maintain a variety of state +information. This might include mode settings, symbol tables, and +other details. As an example, suppose that you wanted to keep track +of how many NUMBER tokens had been encountered. + +One way to do this is to keep a set of global variables in the module +where you created the lexer. For example:: + + num_count = 0 + def t_NUMBER(t): + r'\d+' + global num_count + num_count += 1 + t.value = int(t.value) + return t + +If you don't like the use of a global variable, another place to store +information is inside the Lexer object created by ``lex()``. To this, +you can use the ``lexer`` attribute of tokens passed to the various +rules. For example:: + + def t_NUMBER(t): + r'\d+' + t.lexer.num_count += 1 # Note use of lexer attribute + t.value = int(t.value) + return t + + lexer = lex.lex() + lexer.num_count = 0 # Set the initial count + +This latter approach has the advantage of being simple and working +correctly in applications where multiple instantiations of a given +lexer exist in the same application. However, this might also feel +like a gross violation of encapsulation to OO purists. Just to put +your mind at some ease, all internal attributes of the lexer (with the +exception of ``lineno``) have names that are prefixed by ``lex`` +(e.g., ``lexdata``,``lexpos``, etc.). Thus, it is perfectly safe to +store attributes in the lexer that don't have names starting with that +prefix or a name that conflicts with one of the predefined methods +(e.g., ``input()``, ``token()``, etc.). + +If you don't like assigning values on the lexer object, you can define +your lexer as a class as shown in the previous section:: + + class MyLexer: + ... + def t_NUMBER(self,t): + r'\d+' + self.num_count += 1 + t.value = int(t.value) + return t + + def build(self, **kwargs): + self.lexer = lex.lex(object=self,**kwargs) + + def __init__(self): + self.num_count = 0 + +The class approach may be the easiest to manage if your application is +going to be creating multiple instances of the same lexer and you need +to manage a lot of state. + +State can also be managed through closures. For example:: + + def MyLexer(): + num_count = 0 + ... + def t_NUMBER(t): + r'\d+' + nonlocal num_count + num_count += 1 + t.value = int(t.value) + return t + ... 
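+
+If you need to read such closure state back out, one option is to return an
+accessor function along with the lexer.  The following is only an
+illustrative sketch (the ``MyLexer`` and ``get_count`` names are not part
+of PLY)::
+
+    import ply.lex as lex
+
+    tokens = ('NUMBER',)
+
+    def MyLexer():
+        num_count = 0
+
+        def t_NUMBER(t):
+            r'\d+'
+            nonlocal num_count
+            num_count += 1
+            t.value = int(t.value)
+            return t
+
+        t_ignore = ' \t'
+
+        def t_error(t):
+            t.lexer.skip(1)
+
+        def get_count():
+            # Report how many NUMBER tokens have been seen so far
+            return num_count
+
+        # Build the lexer from the enclosing environment and return it
+        return lex.lex(), get_count
+
+    lexer, get_count = MyLexer()
+    lexer.input("3 4 5")
+    list(lexer)            # Consume all tokens
+    print(get_count())     # Prints 3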
+ +Lexer cloning +^^^^^^^^^^^^^ + +If necessary, a lexer object can be duplicated by invoking its +``clone()`` method. For example:: + + lexer = lex.lex() + ... + newlexer = lexer.clone() + +When a lexer is cloned, the copy is exactly identical to the original +lexer including any input text and internal state. However, the clone +allows a different set of input text to be supplied which may be +processed separately. This may be useful in situations when you are +writing a parser/compiler that involves recursive or reentrant +processing. For instance, if you needed to scan ahead in the input +for some reason, you could create a clone and use it to look ahead. +Or, if you were implementing some kind of preprocessor, cloned lexers +could be used to handle different input files. + +Creating a clone is different than calling ``lex.lex()`` in that +PLY doesn't regenerate any of the internal tables or regular expressions. + +Special considerations need to be made when cloning lexers that also +maintain their own internal state using classes or closures. Namely, +you need to be aware that the newly created lexers will share all of +this state with the original lexer. For example, if you defined a +lexer as a class and did this:: + + m = MyLexer() + a = lex.lex(object=m) # Create a lexer + + b = a.clone() # Clone the lexer + +Then both ``a`` and ``b`` are going to be bound to the same object +``m`` and any changes to ``m`` will be reflected in both lexers. It's +important to emphasize that ``clone()`` is only meant to create a new +lexer that reuses the regular expressions and environment of another +lexer. If you need to make a totally new copy of a lexer, then call +``lex()`` again. + +Internal lexer state +^^^^^^^^^^^^^^^^^^^^ + +A Lexer object ``lexer`` has a number of internal attributes that may be useful in certain +situations. + +``lexer.lexpos`` + This attribute is an integer that contains the current position + within the input text. If you modify the value, it will change + the result of the next call to ``token()``. Within token rule + functions, this points to the first character *after* the + matched text. If the value is modified within a rule, the next + returned token will be matched at the new position. + +``lexer.lineno`` + The current value of the line number attribute stored in the + lexer. PLY only specifies that the attribute exists---it never + sets, updates, or performs any processing with it. If you want to + track line numbers, you will need to add code yourself (see the + section on line numbers and positional information). + +``lexer.lexdata`` + The current input text stored in the lexer. This is the string + passed with the ``input()`` method. It would probably be a bad + idea to modify this unless you really know what you're doing. + +``lexer.lexmatch`` + This is the raw ``Match`` object returned by the Python + ``re.match()`` function (used internally by PLY) for the current + token. If you have written a regular expression that contains + named groups, you can use this to retrieve those values. Note: + This attribute is only updated when tokens are defined and + processed by functions. + +Conditional lexing and start conditions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In advanced parsing applications, it may be useful to have different +lexing states. For instance, you may want the occurrence of a certain +token or syntactic construct to trigger a different kind of lexing. 
+PLY supports a feature that allows the underlying lexer to be put into +a series of different states. Each state can have its own tokens, +lexing rules, and so forth. The implementation is based largely on +the "start condition" feature of GNU flex. Details of this can be +found at http://flex.sourceforge.net/manual/Start-Conditions.html + +To define a new lexing state, it must first be declared. This is done +by including a "states" declaration in your lex file. For example:: + + states = ( + ('foo','exclusive'), + ('bar','inclusive'), + ) + +This declaration declares two states, ``'foo'`` and ``'bar'``. States +may be of two types; ``'exclusive'`` and ``'inclusive'``. An +exclusive state completely overrides the default behavior of the +lexer. That is, lex will only return tokens and apply rules defined +specifically for that state. An inclusive state adds additional +tokens and rules to the default set of rules. Thus, lex will return +both the tokens defined by default in addition to those defined for +the inclusive state. + +Once a state has been declared, tokens and rules are declared by +including the state name in token/rule declaration. For example:: + + t_foo_NUMBER = r'\d+' # Token 'NUMBER' in state 'foo' + t_bar_ID = r'[a-zA-Z_][a-zA-Z0-9_]*' # Token 'ID' in state 'bar' + + def t_foo_newline(t): + r'\n' + t.lexer.lineno += 1 + +A token can be declared in multiple states by including multiple state +names in the declaration. For example:: + + t_foo_bar_NUMBER = r'\d+' # Defines token 'NUMBER' in both state 'foo' and 'bar' + +Alternative, a token can be declared in all states using the 'ANY' in +the name:: + + t_ANY_NUMBER = r'\d+' # Defines a token 'NUMBER' in all states + +If no state name is supplied, as is normally the case, the token is +associated with a special state ``'INITIAL'``. For example, these two +declarations are identical:: + + t_NUMBER = r'\d+' + t_INITIAL_NUMBER = r'\d+' + + +States are also associated with the special ``t_ignore``, +``t_error()``, and ``t_eof()`` declarations. For example, if a state +treats these differently, you can declare:: + + t_foo_ignore = " \t\n" # Ignored characters for state 'foo' + + def t_bar_error(t): # Special error handler for state 'bar' + pass + +By default, lexing operates in the ``'INITIAL'`` state. This state +includes all of the normally defined tokens. For users who aren't +using different states, this fact is completely transparent. If, +during lexing or parsing, you want to change the lexing state, use the +``begin()`` method. For example:: + + def t_begin_foo(t): + r'start_foo' + t.lexer.begin('foo') # Starts 'foo' state + +To get out of a state, you use ``begin()`` to switch back to the +initial state. For example:: + + def t_foo_end(t): + r'end_foo' + t.lexer.begin('INITIAL') # Back to the initial state + +The management of states can also be done with a stack. For example:: + + def t_begin_foo(t): + r'start_foo' + t.lexer.push_state('foo') # Starts 'foo' state + + def t_foo_end(t): + r'end_foo' + t.lexer.pop_state() # Back to the previous state + + +The use of a stack would be useful in situations where there are many +ways of entering a new lexing state and you merely want to go back to +the previous state afterwards. + +An example might help clarify. Suppose you were writing a parser and +you wanted to grab sections of arbitrary C code enclosed by curly +braces. 
That is, whenever you encounter a starting brace ``'{'``, you +want to read all of the enclosed code up to the ending brace ``'}'`` and +return it as a string. Doing this with a normal regular expression +rule is nearly (if not actually) impossible. This is because braces +can be nested and can be included in comments and strings. Thus, +matching up to the first matching ``'}'`` character isn't good +enough. Here is how you might use lexer states to do this:: + + # Declare the state + states = ( + ('ccode','exclusive'), + ) + + # Match the first {. Enter ccode state. + def t_ccode(t): + r'\{' + t.lexer.code_start = t.lexer.lexpos # Record the starting position + t.lexer.level = 1 # Initial brace level + t.lexer.begin('ccode') # Enter 'ccode' state + + # Rules for the ccode state + def t_ccode_lbrace(t): + r'\{' + t.lexer.level +=1 + + def t_ccode_rbrace(t): + r'\}' + t.lexer.level -=1 + + # If closing brace, return the code fragment + if t.lexer.level == 0: + t.value = t.lexer.lexdata[t.lexer.code_start:t.lexer.lexpos+1] + t.type = "CCODE" + t.lexer.lineno += t.value.count('\n') + t.lexer.begin('INITIAL') + return t + + # C or C++ comment (ignore) + def t_ccode_comment(t): + r'(/\*(.|\n)*?\*/)|(//.*)' + pass + + # C string + def t_ccode_string(t): + r'\"([^\\\n]|(\\.))*?\"' + + # C character literal + def t_ccode_char(t): + r'\'([^\\\n]|(\\.))*?\'' + + # Any sequence of non-whitespace characters (not braces, strings) + def t_ccode_nonspace(t): + r'[^\s\{\}\'\"]+' + + # Ignored characters (whitespace) + t_ccode_ignore = " \t\n" + + # For bad characters, we just skip over it + def t_ccode_error(t): + t.lexer.skip(1) + +In this example, the occurrence of the first '{' causes the lexer to +record the starting position and enter a new state ``'ccode'``. A +collection of rules then match various parts of the input that follow +(comments, strings, etc.). All of these rules merely discard the +token (by not returning a value). However, if the closing right brace +is encountered, the rule ``t_ccode_rbrace`` collects all of the code +(using the earlier recorded starting position), stores it, and returns +a token 'CCODE' containing all of that text. When returning the +token, the lexing state is restored back to its initial state. + +Miscellaneous Issues +^^^^^^^^^^^^^^^^^^^^ + +- The lexer requires input to be supplied as a single input string. + Since most machines have more than enough memory, this rarely presents + a performance concern. However, it means that the lexer currently + can't be used with streaming data such as open files or sockets. This + limitation is primarily a side-effect of using the ``re`` module. You + might be able to work around this by implementing an appropriate ``def + t_eof()`` end-of-file handling rule. The main complication here is + that you'll probably need to ensure that data is fed to the lexer in a + way so that it doesn't split in in the middle of a token. + +- If you need to supply optional flags to the re.compile() function, + use the reflags option to lex. For example:: + + lex.lex(reflags=re.UNICODE | re.VERBOSE) + + Note: by default, ``reflags`` is set to ``re.VERBOSE``. If you provide + your own flags, you may need to include this for PLY to preserve its normal behavior. + +- If you are going to create a hand-written lexer and you plan to use it with ``yacc.py``, + it only needs to conform to the following requirements: + + 1. It must provide a ``token()`` method that returns the next token or + ``None`` if no more tokens are available. + + 2. 
The ``token()`` method must return an object ``tok`` that has + ``type`` and ``value`` attributes. If line number tracking is + being used, then the token should also define a ``lineno`` + attribute. + +Parsing basics +-------------- + +``yacc.py`` is used to parse language syntax. Before showing an +example, there are a few important bits of background that must be +mentioned. First, *syntax* is usually specified in terms of a +BNF grammar. For example, if you wanted to parse simple arithmetic +expressions, you might first write an unambiguous grammar +specification like this:: + + expression : expression + term + | expression - term + | term + + term : term * factor + | term / factor + | factor + + factor : NUMBER + | ( expression ) + +In the grammar, symbols such as ``NUMBER``, ``+``, ``-``, ``*``, and +``/`` are known as *terminals* and correspond to input +tokens. Identifiers such as ``term`` and ``factor`` refer to grammar +rules comprised of a collection of terminals and other rules. These +identifiers are known as *non-terminals*. + +The semantic behavior of a language is often specified using a +technique known as syntax directed translation. In syntax directed +translation, attributes are attached to each symbol in a given grammar +rule along with an action. Whenever a particular grammar rule is +recognized, the action describes what to do. For example, given the +expression grammar above, you might write the specification for a +simple calculator like this:: + + Grammar Action + -------------------------------- -------------------------------------------- + expression0 : expression1 + term expression0.val = expression1.val + term.val + | expression1 - term expression0.val = expression1.val - term.val + | term expression0.val = term.val + + term0 : term1 * factor term0.val = term1.val * factor.val + | term1 / factor term0.val = term1.val / factor.val + | factor term0.val = factor.val + + factor : NUMBER factor.val = int(NUMBER.lexval) + | ( expression ) factor.val = expression.val + +A good way to think about syntax directed translation is to view each +symbol in the grammar as a kind of object. Associated with each symbol +is a value representing its "state" (for example, the ``val`` +attribute above). Semantic actions are then expressed as a collection +of functions or methods that operate on the symbols and associated +values. + +Yacc uses a parsing technique known as LR-parsing or shift-reduce +parsing. LR parsing is a bottom up technique that tries to recognize +the right-hand-side of various grammar rules. Whenever a valid +right-hand-side is found in the input, the appropriate action code is +triggered and the grammar symbols are replaced by the grammar symbol +on the left-hand-side. + +LR parsing is commonly implemented by shifting grammar symbols onto a +stack and looking at the stack and the next input token for patterns +that match one of the grammar rules. The details of the algorithm can +be found in a compiler textbook, but the following example illustrates +the steps that are performed if you wanted to parse the expression ``3 ++ 5 * (10 - 20)`` using the grammar defined above. 
In the example, +the special symbol ``$`` represents the end of input:: + + Step Symbol Stack Input Tokens Action + ---- --------------------- --------------------- ------------------------------- + 1 3 + 5 * ( 10 - 20 )$ Shift 3 + 2 3 + 5 * ( 10 - 20 )$ Reduce factor : NUMBER + 3 factor + 5 * ( 10 - 20 )$ Reduce term : factor + 4 term + 5 * ( 10 - 20 )$ Reduce expr : term + 5 expr + 5 * ( 10 - 20 )$ Shift + + 6 expr + 5 * ( 10 - 20 )$ Shift 5 + 7 expr + 5 * ( 10 - 20 )$ Reduce factor : NUMBER + 8 expr + factor * ( 10 - 20 )$ Reduce term : factor + 9 expr + term * ( 10 - 20 )$ Shift * + 10 expr + term * ( 10 - 20 )$ Shift ( + 11 expr + term * ( 10 - 20 )$ Shift 10 + 12 expr + term * ( 10 - 20 )$ Reduce factor : NUMBER + 13 expr + term * ( factor - 20 )$ Reduce term : factor + 14 expr + term * ( term - 20 )$ Reduce expr : term + 15 expr + term * ( expr - 20 )$ Shift - + 16 expr + term * ( expr - 20 )$ Shift 20 + 17 expr + term * ( expr - 20 )$ Reduce factor : NUMBER + 18 expr + term * ( expr - factor )$ Reduce term : factor + 19 expr + term * ( expr - term )$ Reduce expr : expr - term + 20 expr + term * ( expr )$ Shift ) + 21 expr + term * ( expr ) $ Reduce factor : (expr) + 22 expr + term * factor $ Reduce term : term * factor + 23 expr + term $ Reduce expr : expr + term + 24 expr $ Reduce expr + 25 $ Success! + +When parsing the expression, an underlying state machine and the +current input token determine what happens next. If the next token +looks like part of a valid grammar rule (based on other items on the +stack), it is generally shifted onto the stack. If the top of the +stack contains a valid right-hand-side of a grammar rule, it is +usually "reduced" and the symbols replaced with the symbol on the +left-hand-side. When this reduction occurs, the appropriate action is +triggered (if defined). If the input token can't be shifted and the +top of stack doesn't match any grammar rules, a syntax error has +occurred and the parser must take some kind of recovery step (or bail +out). A parse is only successful if the parser reaches a state where +the symbol stack is empty and there are no more input tokens. + +It is important to note that the underlying implementation is built +around a large finite-state machine that is encoded in a collection of +tables. The construction of these tables is non-trivial and +beyond the scope of this discussion. However, subtle details of this +process explain why, in the example above, the parser chooses to shift +a token onto the stack in step 9 rather than reducing the +rule ``expr : expr + term``. + +Yacc +---- + +The ``ply.yacc`` module implements the parsing component of PLY. +The name "yacc" stands for "Yet Another Compiler Compiler" and is +borrowed from the Unix tool of the same name. + +An example +^^^^^^^^^^ + +Suppose you wanted to make a grammar for simple arithmetic expressions +as previously described. Here is how you would do it with +``yacc.py``:: + + # Yacc example + + import ply.yacc as yacc + + # Get the token map from the lexer. This is required. 
+    from calclex import tokens
+
+    def p_expression_plus(p):
+        'expression : expression PLUS term'
+        p[0] = p[1] + p[3]
+
+    def p_expression_minus(p):
+        'expression : expression MINUS term'
+        p[0] = p[1] - p[3]
+
+    def p_expression_term(p):
+        'expression : term'
+        p[0] = p[1]
+
+    def p_term_times(p):
+        'term : term TIMES factor'
+        p[0] = p[1] * p[3]
+
+    def p_term_div(p):
+        'term : term DIVIDE factor'
+        p[0] = p[1] / p[3]
+
+    def p_term_factor(p):
+        'term : factor'
+        p[0] = p[1]
+
+    def p_factor_num(p):
+        'factor : NUMBER'
+        p[0] = p[1]
+
+    def p_factor_expr(p):
+        'factor : LPAREN expression RPAREN'
+        p[0] = p[2]
+
+    # Error rule for syntax errors
+    def p_error(p):
+        print("Syntax error in input!")
+
+    # Build the parser
+    parser = yacc.yacc()
+
+    while True:
+        try:
+            s = input('calc > ')
+        except EOFError:
+            break
+        if not s: continue
+        result = parser.parse(s)
+        print(result)
+
+In this example, each grammar rule is defined by a Python function
+where the docstring of that function contains the appropriate
+context-free grammar specification. The statements that make up the
+function body implement the semantic actions of the rule. Each
+function accepts a single argument ``p`` that is a sequence containing
+the values of each grammar symbol in the corresponding rule. The
+values of ``p[i]`` are mapped to grammar symbols as shown here::
+
+    def p_expression_plus(p):
+        'expression : expression PLUS term'
+        #   ^            ^        ^    ^
+        #  p[0]         p[1]     p[2] p[3]
+
+        p[0] = p[1] + p[3]
+
+For tokens, the "value" of the corresponding ``p[i]`` is the
+*same* as the ``p.value`` attribute assigned in the lexer
+module. For non-terminals, the value is determined by whatever is
+placed in ``p[0]`` when rules are reduced. This value can be anything
+at all. However, it is probably most common for the value to be a
+simple Python type, a tuple, or an instance. In this example, we are
+relying on the fact that the ``NUMBER`` token stores an integer value
+in its value field. All of the other rules perform various types of
+integer operations and propagate the result.
+
+Note: The use of negative indices has a special meaning in
+yacc---specifically, ``p[-1]`` does not have the same value as ``p[3]``
+in this example. Please see the section on "Embedded Actions" for
+further details.
+
+The first rule defined in the yacc specification determines the
+starting grammar symbol (in this case, a rule for ``expression``
+appears first). Whenever the starting rule is reduced by the parser
+and no more input is available, parsing stops and the final value is
+returned (this value will be whatever the top-most rule placed in
+``p[0]``). Note: an alternative starting symbol can be specified using
+the ``start`` keyword argument to ``yacc()``.
+
+The ``p_error(p)`` rule is defined to catch syntax errors. See the
+error handling section below for more detail.
+
+To build the parser, call the ``yacc.yacc()`` function. This function
+looks at the module and attempts to construct all of the LR parsing
+tables for the grammar you have specified.
+
+If any errors are detected in your grammar specification, ``yacc.py``
+will produce diagnostic messages and possibly raise an exception.
+Some of the errors that can be detected include:
+
+- Duplicated function names (if more than one rule function has the same name in the grammar file).
+- Shift/reduce and reduce/reduce conflicts generated by ambiguous grammars.
+- Badly specified grammar rules.
+- Infinite recursion (rules that can never terminate).
+- Unused rules and tokens
+- Undefined rules and tokens
+
+The next few sections discuss grammar specification in more detail.
+
+The final part of the example shows how to actually run the parser
+created by ``yacc()``. To run the parser, you have to call the
+``parse()`` method with a string of input text. This will run all of
+the grammar rules and return the result of the entire parse. The
+result returned is the value assigned to ``p[0]`` in the starting
+grammar rule.
+
+Combining Grammar Rule Functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When grammar rules are similar, they can be combined into a single
+function. For example, consider the two rules in our earlier
+example::
+
+    def p_expression_plus(p):
+        'expression : expression PLUS term'
+        p[0] = p[1] + p[3]
+
+    def p_expression_minus(p):
+        'expression : expression MINUS term'
+        p[0] = p[1] - p[3]
+
+Instead of writing two functions, you might write a single function
+like this::
+
+    def p_expression(p):
+        '''expression : expression PLUS term
+                      | expression MINUS term'''
+        if p[2] == '+':
+            p[0] = p[1] + p[3]
+        elif p[2] == '-':
+            p[0] = p[1] - p[3]
+
+In general, the docstring for any given function can contain multiple
+grammar rules. So, it would have also been legal (although possibly
+confusing) to write this::
+
+    def p_binary_operators(p):
+        '''expression : expression PLUS term
+                      | expression MINUS term
+           term       : term TIMES factor
+                      | term DIVIDE factor'''
+        if p[2] == '+':
+            p[0] = p[1] + p[3]
+        elif p[2] == '-':
+            p[0] = p[1] - p[3]
+        elif p[2] == '*':
+            p[0] = p[1] * p[3]
+        elif p[2] == '/':
+            p[0] = p[1] / p[3]
+
+When combining grammar rules into a single function, it is usually a
+good idea for all of the rules to have a similar structure (e.g., the
+same number of terms). Otherwise, the corresponding action code may
+be more complicated than necessary. However, it is possible to handle
+simple cases using ``len()``. For example::
+
+    def p_expressions(p):
+        '''expression : expression MINUS expression
+                      | MINUS expression'''
+        if len(p) == 4:
+            p[0] = p[1] - p[3]
+        elif len(p) == 3:
+            p[0] = -p[2]
+
+If parsing performance is a concern, you should resist the urge to put
+too much conditional processing into a single grammar rule as shown in
+these examples. When you add checks to see which grammar rule is
+being handled, you are actually duplicating the work that the parser
+has already performed (i.e., the parser already knows exactly what
+rule it matched). You can eliminate this overhead by using a separate
+``p_rule()`` function for each grammar rule.
+
+Character Literals
+^^^^^^^^^^^^^^^^^^
+
+If desired, a grammar may contain tokens defined as single character
+literals. For example::
+
+    def p_binary_operators(p):
+        '''expression : expression '+' term
+                      | expression '-' term
+           term       : term '*' factor
+                      | term '/' factor'''
+        if p[2] == '+':
+            p[0] = p[1] + p[3]
+        elif p[2] == '-':
+            p[0] = p[1] - p[3]
+        elif p[2] == '*':
+            p[0] = p[1] * p[3]
+        elif p[2] == '/':
+            p[0] = p[1] / p[3]
+
+A character literal must be enclosed in quotes such as ``'+'``. In
+addition, if literals are used, they must be declared in the
+corresponding ``lex`` file through the use of a special ``literals``
+declaration::
+
+    # Literals.  Should be placed in module given to lex()
+    literals = ['+', '-', '*', '/']
+
+Character literals are limited to a single character. Thus, it is not
+legal to specify literals such as ``'<='`` or ``'=='``. For this,
+use the normal lexing rules (e.g., define a rule such as
+``t_EQ = r'=='``).
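+
+As a quick illustration, here is a minimal lexer sketch that mixes
+character literals with a normal multi-character token rule (the
+``NUMBER`` and ``EQ`` token names here are only assumptions for this
+example)::
+
+    import ply.lex as lex
+
+    tokens = ['NUMBER', 'EQ']          # Named tokens
+    literals = ['+', '-', '*', '/']    # Single-character literals
+
+    t_EQ = r'=='                       # Multi-character operators need a normal rule
+
+    def t_NUMBER(t):
+        r'\d+'
+        t.value = int(t.value)
+        return t
+
+    t_ignore = ' \t'
+
+    def t_error(t):
+        t.lexer.skip(1)
+
+    lexer = lex.lex()
+
+When a literal is matched, the resulting token has its ``type`` and
+``value`` both set to the character itself (e.g., ``'+'``), which is
+why grammar rules can refer to it directly as ``'+'``.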
+
+Empty Productions
+^^^^^^^^^^^^^^^^^
+
+``yacc.py`` can handle empty productions by defining a rule like this::
+
+    def p_empty(p):
+        'empty :'
+        pass
+
+Now to use the empty production, use 'empty' as a symbol. For
+example::
+
+    def p_optitem(p):
+        '''optitem : item
+                   | empty'''
+        ...
+
+Note: You can write empty rules anywhere by specifying an empty
+right hand side. However, I personally find that writing an "empty"
+rule and using "empty" to denote an empty production is easier to read
+and more clearly states your intentions.
+
+Changing the starting symbol
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Normally, the first rule found in a yacc specification defines the
+starting grammar rule (top level rule). To change this, supply
+a ``start`` specifier in your file. For example::
+
+    start = 'foo'
+
+    def p_bar(p):
+        'bar : A B'
+
+    # This is the starting rule due to the start specifier above
+    def p_foo(p):
+        'foo : bar X'
+        ...
+
+The use of a ``start`` specifier may be useful during debugging
+since you can use it to have yacc build a subset of a larger grammar.
+For this purpose, it is also possible to specify a starting symbol as
+an argument to ``yacc()``. For example::
+
+    parser = yacc.yacc(start='foo')
+
+Dealing With Ambiguous Grammars
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The expression grammar given in the earlier example has been written
+in a special format to eliminate ambiguity. However, in many
+situations, it is extremely difficult or awkward to write grammars in
+this format. A much more natural way to express the grammar is in a
+more compact form like this::
+
+    expression : expression PLUS expression
+               | expression MINUS expression
+               | expression TIMES expression
+               | expression DIVIDE expression
+               | LPAREN expression RPAREN
+               | NUMBER
+
+Unfortunately, this grammar specification is ambiguous. For example,
+if you are parsing the string "3 * 4 + 5", there is no way to tell how
+the operators are supposed to be grouped. For example, does the
+expression mean "(3 * 4) + 5" or is it "3 * (4+5)"?
+
+When an ambiguous grammar is given to ``yacc.py`` it will print
+messages about "shift/reduce conflicts" or "reduce/reduce conflicts".
+A shift/reduce conflict is caused when the parser generator can't
+decide whether to reduce a rule or shift a symbol onto the
+parsing stack. For example, consider the string "3 * 4 + 5" and the
+internal parsing stack::
+
+    Step Symbol Stack           Input Tokens            Action
+    ---- ---------------------  ---------------------   -------------------------------
+    1    $                      3 * 4 + 5$              Shift 3
+    2    $ 3                    * 4 + 5$                Reduce expression : NUMBER
+    3    $ expr                 * 4 + 5$                Shift *
+    4    $ expr *               4 + 5$                  Shift 4
+    5    $ expr * 4             + 5$                    Reduce expression : NUMBER
+    6    $ expr * expr          + 5$                    SHIFT/REDUCE CONFLICT ????
+
+In this case, when the parser reaches step 6, it has two options. One
+is to reduce the rule ``expr : expr * expr`` on the stack. The other
+option is to shift the token ``+`` onto the stack. Both options are
+perfectly legal according to the rules of the context-free grammar.
+
+By default, all shift/reduce conflicts are resolved in favor of
+shifting. Therefore, in the above example, the parser will always
+shift the ``+`` instead of reducing. Although this strategy works in
+many cases (for example, the case of "if-then" versus "if-then-else"),
+it is not enough for arithmetic expressions. In fact, in the above
+example, the decision to shift ``+`` is completely wrong---we should
+have reduced ``expr * expr`` since multiplication has higher
+mathematical precedence than addition.
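+
+For reference, this ambiguous grammar might be written as a single
+PLY rule function like this (a sketch that assumes the token names
+from the earlier calculator example)::
+
+    def p_expression(p):
+        '''expression : expression PLUS expression
+                      | expression MINUS expression
+                      | expression TIMES expression
+                      | expression DIVIDE expression
+                      | LPAREN expression RPAREN
+                      | NUMBER'''
+        ...
+
+Handing this specification to ``yacc()`` as-is will produce
+shift/reduce conflict warnings when the parsing tables are built.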
+ +To resolve ambiguity, especially in expression grammars, ``yacc.py`` +allows individual tokens to be assigned a precedence level and +associativity. This is done by adding a variable ``precedence`` to +the grammar file like this:: + + precedence = ( + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ) + +This declaration specifies that ``PLUS``/``MINUS`` have the same +precedence level and are left-associative and that +``TIMES``/``DIVIDE`` have the same precedence and are +left-associative. Within the ``precedence`` declaration, tokens are +ordered from lowest to highest precedence. Thus, this declaration +specifies that ``TIMES``/``DIVIDE`` have higher precedence than +``PLUS``/``MINUS`` (since they appear later in the precedence +specification). + +The precedence specification works by associating a numerical +precedence level value and associativity direction to the listed +tokens. For example, in the above example you get:: + + PLUS : level = 1, assoc = 'left' + MINUS : level = 1, assoc = 'left' + TIMES : level = 2, assoc = 'left' + DIVIDE : level = 2, assoc = 'left' + +These values are then used to attach a numerical precedence value and +associativity direction to each grammar rule. *This is always +determined by looking at the precedence of the right-most terminal +symbol.* For example:: + + expression : expression PLUS expression # level = 1, left + | expression MINUS expression # level = 1, left + | expression TIMES expression # level = 2, left + | expression DIVIDE expression # level = 2, left + | LPAREN expression RPAREN # level = None (not specified) + | NUMBER # level = None (not specified) + +When shift/reduce conflicts are encountered, the parser generator +resolves the conflict by looking at the precedence rules and +associativity specifiers. + +1. If the current token has higher precedence than the rule on the stack, it is shifted. + +2. If the grammar rule on the stack has higher precedence, the rule is reduced. + +3. If the current token and the grammar rule have the same precedence, the + rule is reduced for left associativity, whereas the token is shifted for right associativity. + +4. If nothing is known about the precedence, shift/reduce conflicts are resolved in + favor of shifting (the default). + +For example, if "expression PLUS expression" has been parsed and the +next token is "TIMES", the action is going to be a shift because +"TIMES" has a higher precedence level than "PLUS". On the other hand, +if "expression TIMES expression" has been parsed and the next token is +"PLUS", the action is going to be reduce because "PLUS" has a lower +precedence than "TIMES." + +When shift/reduce conflicts are resolved using the first three +techniques (with the help of precedence rules), ``yacc.py`` will +report no errors or conflicts in the grammar (although it will print +some information in the ``parser.out`` debugging file). + +One problem with the precedence specifier technique is that it is +sometimes necessary to change the precedence of an operator in certain +contexts. For example, consider a unary-minus operator in "3 + 4 * +-5". Mathematically, the unary minus is normally given a very high +precedence--being evaluated before the multiply. However, in our +precedence specifier, MINUS has a lower precedence than TIMES. 
To +deal with this, precedence rules can be given for so-called +"fictitious tokens" like this:: + + precedence = ( + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ('right', 'UMINUS'), # Unary minus operator + ) + +Now, in the grammar file, we can write our unary minus rule like +this:: + + def p_expr_uminus(p): + 'expression : MINUS expression %prec UMINUS' + p[0] = -p[2] + +In this case, ``%prec UMINUS`` overrides the default rule +precedence--setting it to that of UMINUS in the precedence specifier. + +At first, the use of UMINUS in this example may appear very confusing. +UMINUS is not an input token or a grammar rule. Instead, you should +think of it as the name of a special marker in the precedence table. +When you use the ``%prec`` qualifier, you're telling yacc that +you want the precedence of the expression to be the same as for this +special marker instead of the usual precedence. + +It is also possible to specify non-associativity in the ``precedence`` +table. This would be used when you *don't* want operations to +chain together. For example, suppose you wanted to support comparison +operators like ``<`` and ``>`` but you didn't want to allow +combinations like ``a < b < c``. To do this, specify a +rule like this:: + + precedence = ( + ('nonassoc', 'LESSTHAN', 'GREATERTHAN'), # Nonassociative operators + ('left', 'PLUS', 'MINUS'), + ('left', 'TIMES', 'DIVIDE'), + ('right', 'UMINUS'), # Unary minus operator + ) + +If you do this, the occurrence of input text such as ``a < b < c`` +will result in a syntax error. However, simple expressions such +as ``a < b`` will still be fine. + +Reduce/reduce conflicts are caused when there are multiple grammar +rules that can be applied to a given set of symbols. This kind of +conflict is almost always bad and is always resolved by picking the +rule that appears first in the grammar file. Reduce/reduce conflicts +are almost always caused when different sets of grammar rules somehow +generate the same set of symbols. For example:: + + assignment : ID EQUALS NUMBER + | ID EQUALS expression + + expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression + | LPAREN expression RPAREN + | NUMBER + +In this case, a reduce/reduce conflict exists between these two rules:: + + assignment : ID EQUALS NUMBER + expression : NUMBER + +For example, if you wrote "a = 5", the parser can't figure out if this +is supposed to be reduced as ``assignment : ID EQUALS NUMBER`` or +whether it's supposed to reduce the 5 as an expression and then reduce +the rule ``assignment : ID EQUALS expression``. + +It should be noted that reduce/reduce conflicts are notoriously +difficult to spot looking at the input grammar. When a +reduce/reduce conflict occurs, ``yacc()`` will try to help by printing +a warning message such as this:: + + WARNING: 1 reduce/reduce conflict + WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER) + WARNING: rejected rule (expression -> NUMBER) + +This message identifies the two rules that are in conflict. However, +it may not tell you how the parser arrived at such a state. To try +and figure it out, you'll probably have to look at your grammar and +the contents of the ``parser.out`` debugging file with an +appropriately high level of caffeination. + +The parser.out file +^^^^^^^^^^^^^^^^^^^ + +Tracking down shift/reduce and reduce/reduce conflicts is one of the +finer pleasures of using an LR parsing algorithm. 
To assist in +debugging, ``yacc.py`` can create a debugging file called 'parser.out'. +To create this file, use ``yacc.yacc(debug=True)``. +The contents of this file look like the following:: + + Unused terminals: + + + Grammar + + Rule 1 expression -> expression PLUS expression + Rule 2 expression -> expression MINUS expression + Rule 3 expression -> expression TIMES expression + Rule 4 expression -> expression DIVIDE expression + Rule 5 expression -> NUMBER + Rule 6 expression -> LPAREN expression RPAREN + + Terminals, with rules where they appear + + TIMES : 3 + error : + MINUS : 2 + RPAREN : 6 + LPAREN : 6 + DIVIDE : 4 + PLUS : 1 + NUMBER : 5 + + Nonterminals, with rules where they appear + + expression : 1 1 2 2 3 3 4 4 6 0 + + + Parsing method: LALR + + + state 0 + + S' -> . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 1 + + S' -> expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + PLUS shift and go to state 6 + MINUS shift and go to state 5 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + + state 2 + + expression -> LPAREN . expression RPAREN + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 3 + + expression -> NUMBER . + + $ reduce using rule 5 + PLUS reduce using rule 5 + MINUS reduce using rule 5 + TIMES reduce using rule 5 + DIVIDE reduce using rule 5 + RPAREN reduce using rule 5 + + + state 4 + + expression -> expression TIMES . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 5 + + expression -> expression MINUS . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 6 + + expression -> expression PLUS . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 7 + + expression -> expression DIVIDE . expression + expression -> . expression PLUS expression + expression -> . expression MINUS expression + expression -> . expression TIMES expression + expression -> . expression DIVIDE expression + expression -> . NUMBER + expression -> . 
LPAREN expression RPAREN + + NUMBER shift and go to state 3 + LPAREN shift and go to state 2 + + + state 8 + + expression -> LPAREN expression . RPAREN + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + RPAREN shift and go to state 13 + PLUS shift and go to state 6 + MINUS shift and go to state 5 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + + state 9 + + expression -> expression TIMES expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 3 + PLUS reduce using rule 3 + MINUS reduce using rule 3 + TIMES reduce using rule 3 + DIVIDE reduce using rule 3 + RPAREN reduce using rule 3 + + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + ! TIMES [ shift and go to state 4 ] + ! DIVIDE [ shift and go to state 7 ] + + state 10 + + expression -> expression MINUS expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 2 + PLUS reduce using rule 2 + MINUS reduce using rule 2 + RPAREN reduce using rule 2 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + ! TIMES [ reduce using rule 2 ] + ! DIVIDE [ reduce using rule 2 ] + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + + state 11 + + expression -> expression PLUS expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 1 + PLUS reduce using rule 1 + MINUS reduce using rule 1 + RPAREN reduce using rule 1 + TIMES shift and go to state 4 + DIVIDE shift and go to state 7 + + ! TIMES [ reduce using rule 1 ] + ! DIVIDE [ reduce using rule 1 ] + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + + state 12 + + expression -> expression DIVIDE expression . + expression -> expression . PLUS expression + expression -> expression . MINUS expression + expression -> expression . TIMES expression + expression -> expression . DIVIDE expression + + $ reduce using rule 4 + PLUS reduce using rule 4 + MINUS reduce using rule 4 + TIMES reduce using rule 4 + DIVIDE reduce using rule 4 + RPAREN reduce using rule 4 + + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + ! TIMES [ shift and go to state 4 ] + ! DIVIDE [ shift and go to state 7 ] + + state 13 + + expression -> LPAREN expression RPAREN . + + $ reduce using rule 6 + PLUS reduce using rule 6 + MINUS reduce using rule 6 + TIMES reduce using rule 6 + DIVIDE reduce using rule 6 + RPAREN reduce using rule 6 + +The different states that appear in this file are a representation of +every possible sequence of valid input tokens allowed by the grammar. +When receiving input tokens, the parser is building up a stack and +looking for matching rules. Each state keeps track of the grammar +rules that might be in the process of being matched at that point. +Within each rule, the "." character indicates the current location of +the parse within that rule. In addition, the actions for each valid +input token are listed. 
When a shift/reduce or reduce/reduce conflict +arises, rules *not* selected are prefixed with an !. For +example:: + + ! TIMES [ reduce using rule 2 ] + ! DIVIDE [ reduce using rule 2 ] + ! PLUS [ shift and go to state 6 ] + ! MINUS [ shift and go to state 5 ] + +By looking at these rules (and with a little practice), you can +usually track down the source of most parsing conflicts. It should +also be stressed that not all shift-reduce conflicts are bad. +However, the only way to be sure that they are resolved correctly is +to look at ``parser.out``. + +Syntax Error Handling +^^^^^^^^^^^^^^^^^^^^^ + +If you are creating a parser for production use, the handling of +syntax errors is important. As a general rule, you don't want a +parser to throw up its hands and stop at the first sign of +trouble. Instead, you want it to report the error, recover if +possible, and continue parsing so that all of the errors in the input +get reported to the user at once. This is the standard behavior found +in compilers for languages such as C, C++, and Java. + +In PLY, when a syntax error occurs during parsing, the error is +immediately detected (i.e., the parser does not read any more tokens +beyond the source of the error). However, at this point, the parser +enters a recovery mode that can be used to try and continue further +parsing. As a general rule, error recovery in LR parsers is a +delicate topic that involves ancient rituals and black-magic. The +recovery mechanism provided by ``yacc.py`` is comparable to Unix yacc +so you may want consult a book like O'Reilly's "Lex and Yacc" for some +of the finer details. + +When a syntax error occurs, ``yacc.py`` performs the following steps: + +1. On the first occurrence of an error, the user-defined ``p_error()`` + function is called with the offending token as an + argument. However, if the syntax error is due to reaching the + end-of-file, ``p_error()`` is called with an argument of ``None``. + Afterwards, the parser enters an "error-recovery" mode in which it + will not make future calls to ``p_error()`` until it has + successfully shifted at least 3 tokens onto the parsing stack. + + +2. If no recovery action is taken in ``p_error()``, the offending + lookahead token is replaced with a special ``error`` token. + +3. If the offending lookahead token is already set to ``error``, the + top item of the parsing stack is deleted. + +4. If the entire parsing stack is unwound, the parser enters a restart + state and attempts to start parsing from its initial state. + +5. If a grammar rule accepts ``error`` as a token, it will be + shifted onto the parsing stack. + +6. If the top item of the parsing stack is ``error``, lookahead tokens + will be discarded until the parser can successfully shift a new + symbol or reduce a rule involving ``error``. + +Recovery and resynchronization with error rules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The most well-behaved approach for handling syntax errors is to write +grammar rules that include the ``error`` token. For example, suppose +your language had a grammar rule for a print statement like this:: + + def p_statement_print(p): + 'statement : PRINT expr SEMI' + ... + +To account for the possibility of a bad expression, you might write an +additional grammar rule like this:: + + def p_statement_print_error(p): + 'statement : PRINT error SEMI' + print("Syntax error in print statement. 
Bad expression") + +In this case, the ``error`` token will match any sequence of +tokens that might appear up to the first semicolon that is +encountered. Once the semicolon is reached, the rule will be +invoked and the ``error`` token will go away. + +This type of recovery is sometimes known as parser resynchronization. +The ``error`` token acts as a wildcard for any bad input text and +the token immediately following ``error`` acts as a +synchronization token. + +It is important to note that the ``error`` token usually does not +appear as the last token on the right in an error rule. For example:: + + def p_statement_print_error(p): + 'statement : PRINT error' + print("Syntax error in print statement. Bad expression") + +This is because the first bad token encountered will cause the rule to +be reduced--which may make it difficult to recover if more bad tokens +immediately follow. + +Panic mode recovery +~~~~~~~~~~~~~~~~~~~ + +An alternative error recovery scheme is to enter a panic mode recovery +in which tokens are discarded to a point where the parser might be +able to recover in some sensible manner. + +Panic mode recovery is implemented entirely in the ``p_error()`` +function. For example, this function starts discarding tokens until +it reaches a closing '}'. Then, it restarts the parser in its initial +state:: + + def p_error(p): + print("Whoa. You are seriously hosed.") + if not p: + print("End of File!") + return + + # Read ahead looking for a closing '}' + while True: + tok = parser.token() # Get the next token + if not tok or tok.type == 'RBRACE': + break + parser.restart() + +This function discards the bad token and tells the parser that +the error was ok:: + + def p_error(p): + if p: + print("Syntax error at token", p.type) + # Just discard the token and tell the parser it's okay. + parser.errok() + else: + print("Syntax error at EOF") + +More information on these methods is as follows: + +``parser.errok()`` + This resets the parser state so it doesn't think it's in error-recovery + mode. This will prevent an ``error`` token from being generated and will reset the internal + error counters so that the next syntax error will call ``p_error()`` again. + +``parser.token()`` + This returns the next token on the input stream. + +``parser.restart()``. + This discards the entire parsing stack and resets the parser + to its initial state. + +To supply the next lookahead token to the parser, ``p_error()`` can +return a token. This might be useful if trying to synchronize on +special characters. For example:: + + def p_error(p): + # Read ahead looking for a terminating ";" + while True: + tok = parser.token() # Get the next token + if not tok or tok.type == 'SEMI': break + parser.errok() + + # Return SEMI to the parser as the next lookahead token + return tok + +Keep in mind in that the above error handling functions, ``parser`` is +an instance of the parser created by ``yacc()``. You'll need to save +this instance someplace in your code so that you can refer to it +during error handling. + +Signalling an error from a production +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If necessary, a production rule can manually force the parser to enter +error recovery. This is done by raising the ``SyntaxError`` exception +like this:: + + def p_production(p): + 'production : some production ...' + raise SyntaxError + +The effect of raising ``SyntaxError`` is the same as if the last +symbol shifted onto the parsing stack was actually a syntax error. 
+Thus, when you do this, the last symbol shifted is popped off of the
+parsing stack and the current lookahead token is set to an ``error``
+token. The parser then enters error-recovery mode where it tries to
+reduce rules that can accept ``error`` tokens. The steps that follow
+from this point are exactly the same as if a syntax error were
+detected and ``p_error()`` were called.
+
+One important aspect of manually setting an error is that the
+``p_error()`` function will NOT be called in this case. If you need
+to issue an error message, make sure you do it in the production that
+raises ``SyntaxError``.
+
+Note: This feature of PLY is meant to mimic the behavior of the
+YYERROR macro in yacc.
+
+When Do Syntax Errors Get Reported?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In most cases, yacc will handle errors as soon as a bad input token is
+detected on the input. However, be aware that yacc may choose to
+delay error handling until after it has reduced one or more grammar
+rules first. This behavior might be unexpected, but it's related to
+special states in the underlying parsing table known as "defaulted
+states." A defaulted state is a parsing condition where the same
+grammar rule will be reduced regardless of what *valid* token
+comes next on the input. For such states, yacc chooses to go ahead
+and reduce the grammar rule *without reading the next input
+token*. If the next token is bad, yacc will eventually get around
+to reading it and report a syntax error. It's just a little unusual
+in that you might see some of your grammar rules firing immediately
+prior to the syntax error.
+
+Usually, the delayed error reporting with defaulted states is harmless
+(and there are other reasons for wanting PLY to behave in this way).
+However, if you need to turn this behavior off for some reason, you
+can clear the defaulted states table like this::
+
+    parser = yacc.yacc()
+    parser.defaulted_states = {}
+
+Disabling defaulted states is not recommended if your grammar makes
+use of embedded actions as described in the "Embedded Actions" section.
+
+General comments on error handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For normal types of languages, error recovery with error rules and
+resynchronization characters is probably the most reliable
+technique. This is because you can instrument the grammar to catch
+errors at selected places where it is relatively easy to recover and
+continue parsing. Panic mode recovery is really only useful in
+certain specialized applications where you might want to discard huge
+portions of the input text to find a valid restart point.
+
+Line Number and Position Tracking
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Position tracking is often a tricky problem when writing compilers.
+By default, PLY tracks the line number and position of all tokens.
+This information is available using the following functions:
+
+``p.lineno(num)``. Return the line number for symbol *num*
+
+``p.lexpos(num)``. Return the lexing position for symbol *num*
+
+For example::
+
+    def p_expression(p):
+        'expression : expression PLUS expression'
+        line = p.lineno(2)        # line number of the PLUS token
+        index = p.lexpos(2)       # Position of the PLUS token
+
+As an optional feature, ``yacc.py`` can automatically track line
+numbers and positions for all of the grammar symbols as well.
+However, this extra tracking requires extra processing and can
+significantly slow down parsing. Therefore, it must be enabled by
+passing the ``tracking=True`` option to ``yacc.parse()``.
For +example:: + + yacc.parse(data,tracking=True) + +Once enabled, the ``lineno()`` and ``lexpos()`` methods work for all +grammar symbols. In addition, two additional methods can be used: + +``p.linespan(num)``. Return a tuple (startline,endline) with the starting and ending line number for symbol *num*. + +``p.lexspan(num)``. Return a tuple (start,end) with the starting and ending positions for symbol *num*. + +For example:: + + def p_expression(p): + 'expression : expression PLUS expression' + p.lineno(1) # Line number of the left expression + p.lineno(2) # line number of the PLUS operator + p.lineno(3) # line number of the right expression + ... + start,end = p.linespan(3) # Start,end lines of the right expression + starti,endi = p.lexspan(3) # Start,end positions of right expression + + +Note: The ``lexspan()`` function only returns the range of values up +to the start of the last grammar symbol. + +Although it may be convenient for PLY to track position information on +all grammar symbols, this is often unnecessary. For example, if you +are merely using line number information in an error message, you can +often just key off of a specific token in the grammar rule. For +example:: + + def p_bad_func(p): + 'funccall : fname LPAREN error RPAREN' + # Line number reported from LPAREN token + print("Bad function call at line", p.lineno(2)) + +Similarly, you may get better parsing performance if you only +selectively propagate line number information where it's needed using +the ``p.set_lineno()`` method. For example:: + + def p_fname(p): + 'fname : ID' + p[0] = p[1] + p.set_lineno(0,p.lineno(1)) + +PLY doesn't retain line number information from rules that have +already been parsed. If you are building an abstract syntax tree and +need to have line numbers, you should make sure that the line numbers +appear in the tree itself. + +AST Construction +^^^^^^^^^^^^^^^^ + +``yacc.py`` provides no special functions for constructing an abstract +syntax tree. However, such construction is easy enough to do on your +own. + +A minimal way to construct a tree is to create and propagate a +tuple or list in each grammar rule function. There are many possible +ways to do this, but one example would be something like this:: + + def p_expression_binop(p): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + + p[0] = ('binary-expression',p[2],p[1],p[3]) + + def p_expression_group(p): + 'expression : LPAREN expression RPAREN' + p[0] = ('group-expression',p[2]) + + def p_expression_number(p): + 'expression : NUMBER' + p[0] = ('number-expression',p[1]) + +Another approach is to create a set of data structure for different +kinds of abstract syntax tree nodes and assign nodes to ``p[0]`` in +each rule. 
For example:: + + class Expr: pass + + class BinOp(Expr): + def __init__(self,left,op,right): + self.left = left + self.right = right + self.op = op + + class Number(Expr): + def __init__(self,value): + self.value = value + + def p_expression_binop(p): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + + p[0] = BinOp(p[1],p[2],p[3]) + + def p_expression_group(p): + 'expression : LPAREN expression RPAREN' + p[0] = p[2] + + def p_expression_number(p): + 'expression : NUMBER' + p[0] = Number(p[1]) + +The advantage to this approach is that it may make it easier to attach +more complicated semantics, type checking, code generation, and other +features to the node classes. + +To simplify tree traversal, it may make sense to pick a very generic +tree structure for your parse tree nodes. For example:: + + class Node: + def __init__(self,type,children=None,leaf=None): + self.type = type + if children: + self.children = children + else: + self.children = [ ] + self.leaf = leaf + + def p_expression_binop(p): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + + p[0] = Node("binop", [p[1],p[3]], p[2]) + +Embedded Actions +^^^^^^^^^^^^^^^^ + +The parsing technique used by yacc only allows actions to be executed +at the end of a rule. For example, suppose you have a rule like +this:: + + def p_foo(p): + "foo : A B C D" + print("Parsed a foo", p[1],p[2],p[3],p[4]) + +In this case, the supplied action code only executes after all of the +symbols ``A``, ``B``, ``C``, and ``D`` have been parsed. Sometimes, +however, it is useful to execute small code fragments during +intermediate stages of parsing. For example, suppose you wanted to +perform some action immediately after ``A`` has been parsed. To do +this, write an empty rule like this:: + + def p_foo(p): + "foo : A seen_A B C D" + print("Parsed a foo", p[1],p[3],p[4],p[5]) + print("seen_A returned", p[2]) + + def p_seen_A(p): + "seen_A :" + print("Saw an A = ", p[-1]) # Access grammar symbol to left + p[0] = some_value # Assign value to seen_A + +In this example, the empty ``seen_A`` rule executes immediately after +``A`` is shifted onto the parsing stack. Within this rule, ``p[-1]`` +refers to the symbol on the stack that appears immediately to the left +of the ``seen_A`` symbol. In this case, it would be the value of +``A`` in the ``foo`` rule immediately above. Like other rules, a +value can be returned from an embedded action by assigning it +to ``p[0]`` + +The use of embedded actions can sometimes introduce extra shift/reduce +conflicts. For example, this grammar has no conflicts:: + + def p_foo(p): + """foo : abcd + | abcx""" + + def p_abcd(p): + "abcd : A B C D" + + def p_abcx(p): + "abcx : A B C X" + +However, if you insert an embedded action into one of the rules like +this:: + + def p_foo(p): + """foo : abcd + | abcx""" + + def p_abcd(p): + "abcd : A B C D" + + def p_abcx(p): + "abcx : A B seen_AB C X" + + def p_seen_AB(p): + "seen_AB :" + +an extra shift-reduce conflict will be introduced. This conflict is +caused by the fact that the same symbol ``C`` appears next in both the +``abcd`` and ``abcx`` rules. The parser can either shift the symbol +(``abcd`` rule) or reduce the empty rule ``seen_AB`` (``abcx`` rule). + +A common use of embedded rules is to control other aspects of parsing +such as scoping of local variables. 
For example, if you were parsing
+C code, you might write code like this::
+
+    def p_statements_block(p):
+        "statements : LBRACE new_scope statements RBRACE"
+        # Action code
+        ...
+        pop_scope()              # Return to previous scope
+
+    def p_new_scope(p):
+        "new_scope :"
+        # Create a new scope for local variables
+        s = new_scope()
+        push_scope(s)
+        ...
+
+In this case, the embedded action ``new_scope`` executes
+immediately after a ``LBRACE`` (``{``) symbol is parsed.
+This might adjust internal symbol tables and other aspects of the
+parser. Upon completion of the rule ``statements_block``, code
+might undo the operations performed in the embedded action
+(e.g., ``pop_scope()``).
+
+Miscellaneous Yacc Notes
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. By default, ``yacc.py`` relies on ``lex.py`` for tokenizing. However, an alternative tokenizer
+   can be supplied as follows::
+
+       parser.parse(lexer=x)
+
+   In this case, ``x`` must be a Lexer object that minimally has a ``x.token()`` method for retrieving the next
+   token. If an input string is given to ``parse()``, the lexer must also have an ``x.input()`` method.
+
+2. To print copious amounts of debugging during parsing, use::
+
+       parser.parse(input_text, debug=True)
+
+3. Since LR parsing is driven by tables, the performance of the parser is largely independent of the
+   size of the grammar. The biggest bottlenecks will be the lexer and the complexity of the code in your grammar rules.
+
+4. ``yacc()`` also allows parsers to be defined as classes and as closures (see the section on alternative specification of
+   lexers). However, be aware that only one parser may be defined in a single module (source file). There are various
+   error checks and validation steps that may issue confusing error messages if you try to define multiple parsers
+   in the same source file.
+
+Multiple Parsers and Lexers
+---------------------------
+
+In advanced parsing applications, you may want to have multiple
+parsers and lexers.
+
+As a general rule, this isn't a problem. However, to make it work,
+you need to carefully make sure everything gets hooked up correctly.
+First, make sure you save the objects returned by ``lex()`` and
+``yacc()``. For example::
+
+    lexer = lex.lex()        # Return lexer object
+    parser = yacc.yacc()     # Return parser object
+
+Next, when parsing, make sure you give the ``parse()`` function a
+reference to the lexer it should be using. For example::
+
+    parser.parse(text, lexer=lexer)
+
+If you forget to do this, the parser will use the last lexer
+created--which is not always what you want.
+
+Within lexer and parser rule functions, these objects are also
+available. In the lexer, the "lexer" attribute of a token refers to
+the lexer object that triggered the rule. For example::
+
+    def t_NUMBER(t):
+        r'\d+'
+        ...
+        print(t.lexer)           # Show lexer object
+
+In the parser, the "lexer" and "parser" attributes refer to the lexer
+and parser objects respectively::
+
+    def p_expr_plus(p):
+        'expr : expr PLUS expr'
+        ...
+        print(p.parser)          # Show parser object
+        print(p.lexer)           # Show lexer object
+
+If necessary, arbitrary attributes can be attached to the lexer or
+parser object. For example, if you wanted to have different parsing
+modes, you could attach a mode attribute to the parser object and look
+at it later.
+
+Advanced Debugging
+------------------
+
+Debugging a compiler is typically not an easy task. PLY provides some
+diagnostic capabilities through the use of Python's
+``logging`` module.
The next two sections describe this: + +Debugging the lex() and yacc() commands +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Both the ``lex()`` and ``yacc()`` commands have a debugging mode that +can be enabled using the ``debug`` flag. For example:: + + lex.lex(debug=True) + yacc.yacc(debug=True) + +Normally, the output produced by debugging is routed to either +standard error or, in the case of ``yacc()``, to a file +``parser.out``. This output can be more carefully controlled by +supplying a logging object. Here is an example that adds information +about where different debugging messages are coming from:: + + # Set up a logging object + import logging + logging.basicConfig( + level = logging.DEBUG, + filename = "parselog.txt", + filemode = "w", + format = "%(filename)10s:%(lineno)4d:%(message)s" + ) + log = logging.getLogger() + + lex.lex(debug=True,debuglog=log) + yacc.yacc(debug=True,debuglog=log) + +If you supply a custom logger, the amount of debugging information +produced can be controlled by setting the logging level. Typically, +debugging messages are either issued at the ``DEBUG``, ``INFO``, or +``WARNING`` levels. + +PLY's error messages and warnings are also produced using the logging +interface. This can be controlled by passing a logging object using +the ``errorlog`` parameter:: + + lex.lex(errorlog=log) + yacc.yacc(errorlog=log) + +If you want to completely silence warnings, you can either pass in a +logging object with an appropriate filter level or use the +``NullLogger`` object defined in either ``lex`` or ``yacc``. For +example:: + + yacc.yacc(errorlog=yacc.NullLogger()) + +Run-time Debugging +^^^^^^^^^^^^^^^^^^ + +To enable run-time debugging of a parser, use the ``debug`` option to +parse. This option can either be an integer (which turns +debugging on or off) or an instance of a logger object. For example:: + + log = logging.getLogger() + parser.parse(input,debug=log) + +If a logging object is passed, you can use its filtering level to +control how much output gets generated. The ``INFO`` level is used to +produce information about rule reductions. The ``DEBUG`` level will +show information about the parsing stack, token shifts, and other +details. The ``ERROR`` level shows information related to parsing +errors. + +For very complicated problems, you should pass in a logging object that +redirects to a file where you can more easily inspect the output after +execution. + +Where to go from here? +---------------------- + +The ``examples`` directory of the PLY distribution contains several +simple examples. Please consult a compilers textbook for the theory +and underlying implementation details or LR parsing. + + + + + + + + diff --git a/example/BASIC/basic.py b/example/BASIC/basic.py index 17687b1..8a8a500 100644 --- a/example/BASIC/basic.py +++ b/example/BASIC/basic.py @@ -4,9 +4,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - import basiclex import basparse import basinterp @@ -36,9 +33,9 @@ # Specifying a line number with no code deletes that line from # the program. 
-while 1: +while True: try: - line = raw_input("[BASIC] ") + line = input("[BASIC] ") except EOFError: raise SystemExit if not line: diff --git a/example/GardenSnake/GardenSnake.py b/example/GardenSnake/GardenSnake.py deleted file mode 100644 index 8b493b4..0000000 --- a/example/GardenSnake/GardenSnake.py +++ /dev/null @@ -1,777 +0,0 @@ -# GardenSnake - a parser generator demonstration program -# -# This implements a modified version of a subset of Python: -# - only 'def', 'return' and 'if' statements -# - 'if' only has 'then' clause (no elif nor else) -# - single-quoted strings only, content in raw format -# - numbers are decimal.Decimal instances (not integers or floats) -# - no print statment; use the built-in 'print' function -# - only < > == + - / * implemented (and unary + -) -# - assignment and tuple assignment work -# - no generators of any sort -# - no ... well, no quite a lot - -# Why? I'm thinking about a new indentation-based configuration -# language for a project and wanted to figure out how to do it. Once -# I got that working I needed a way to test it out. My original AST -# was dumb so I decided to target Python's AST and compile it into -# Python code. Plus, it's pretty cool that it only took a day or so -# from sitting down with Ply to having working code. - -# This uses David Beazley's Ply from http://www.dabeaz.com/ply/ - -# This work is hereby released into the Public Domain. To view a copy of -# the public domain dedication, visit -# http://creativecommons.org/licenses/publicdomain/ or send a letter to -# Creative Commons, 543 Howard Street, 5th Floor, San Francisco, -# California, 94105, USA. -# -# Portions of this work are derived from Python's Grammar definition -# and may be covered under the Python copyright and license -# -# Andrew Dalke / Dalke Scientific Software, LLC -# 30 August 2006 / Cape Town, South Africa - -# Changelog: -# 30 August - added link to CC license; removed the "swapcase" encoding - -# Modifications for inclusion in PLY distribution -import sys -sys.path.insert(0, "../..") -from ply import * - -##### Lexer ###### -#import lex -import decimal - -tokens = ( - 'DEF', - 'IF', - 'NAME', - 'NUMBER', # Python decimals - 'STRING', # single quoted strings only; syntax of raw strings - 'LPAR', - 'RPAR', - 'COLON', - 'EQ', - 'ASSIGN', - 'LT', - 'GT', - 'PLUS', - 'MINUS', - 'MULT', - 'DIV', - 'RETURN', - 'WS', - 'NEWLINE', - 'COMMA', - 'SEMICOLON', - 'INDENT', - 'DEDENT', - 'ENDMARKER', -) - -#t_NUMBER = r'\d+' -# taken from decmial.py but without the leading sign - - -def t_NUMBER(t): - r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?""" - t.value = decimal.Decimal(t.value) - return t - - -def t_STRING(t): - r"'([^\\']+|\\'|\\\\)*'" # I think this is right ... - t.value = t.value[1:-1].decode("string-escape") # .swapcase() # for fun - return t - -t_COLON = r':' -t_EQ = r'==' -t_ASSIGN = r'=' -t_LT = r'<' -t_GT = r'>' -t_PLUS = r'\+' -t_MINUS = r'-' -t_MULT = r'\*' -t_DIV = r'/' -t_COMMA = r',' -t_SEMICOLON = r';' - -# Ply nicely documented how to do this. - -RESERVED = { - "def": "DEF", - "if": "IF", - "return": "RETURN", -} - - -def t_NAME(t): - r'[a-zA-Z_][a-zA-Z0-9_]*' - t.type = RESERVED.get(t.value, "NAME") - return t - -# Putting this before t_WS let it consume lines with only comments in -# them so the latter code never sees the WS part. Not consuming the -# newline. 
Needed for "if 1: #comment" - - -def t_comment(t): - r"[ ]*\043[^\n]*" # \043 is '#' - pass - - -# Whitespace -def t_WS(t): - r' [ ]+ ' - if t.lexer.at_line_start and t.lexer.paren_count == 0: - return t - -# Don't generate newline tokens when inside of parenthesis, eg -# a = (1, -# 2, 3) - - -def t_newline(t): - r'\n+' - t.lexer.lineno += len(t.value) - t.type = "NEWLINE" - if t.lexer.paren_count == 0: - return t - - -def t_LPAR(t): - r'\(' - t.lexer.paren_count += 1 - return t - - -def t_RPAR(t): - r'\)' - # check for underflow? should be the job of the parser - t.lexer.paren_count -= 1 - return t - - -def t_error(t): - raise SyntaxError("Unknown symbol %r" % (t.value[0],)) - print "Skipping", repr(t.value[0]) - t.lexer.skip(1) - -# I implemented INDENT / DEDENT generation as a post-processing filter - -# The original lex token stream contains WS and NEWLINE characters. -# WS will only occur before any other tokens on a line. - -# I have three filters. One tags tokens by adding two attributes. -# "must_indent" is True if the token must be indented from the -# previous code. The other is "at_line_start" which is True for WS -# and the first non-WS/non-NEWLINE on a line. It flags the check so -# see if the new line has changed indication level. - -# Python's syntax has three INDENT states -# 0) no colon hence no need to indent -# 1) "if 1: go()" - simple statements have a COLON but no need for an indent -# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent -NO_INDENT = 0 -MAY_INDENT = 1 -MUST_INDENT = 2 - -# only care about whitespace at the start of a line - - -def track_tokens_filter(lexer, tokens): - lexer.at_line_start = at_line_start = True - indent = NO_INDENT - saw_colon = False - for token in tokens: - token.at_line_start = at_line_start - - if token.type == "COLON": - at_line_start = False - indent = MAY_INDENT - token.must_indent = False - - elif token.type == "NEWLINE": - at_line_start = True - if indent == MAY_INDENT: - indent = MUST_INDENT - token.must_indent = False - - elif token.type == "WS": - assert token.at_line_start == True - at_line_start = True - token.must_indent = False - - else: - # A real token; only indent after COLON NEWLINE - if indent == MUST_INDENT: - token.must_indent = True - else: - token.must_indent = False - at_line_start = False - indent = NO_INDENT - - yield token - lexer.at_line_start = at_line_start - - -def _new_token(type, lineno): - tok = lex.LexToken() - tok.type = type - tok.value = None - tok.lineno = lineno - return tok - -# Synthesize a DEDENT tag - - -def DEDENT(lineno): - return _new_token("DEDENT", lineno) - -# Synthesize an INDENT tag - - -def INDENT(lineno): - return _new_token("INDENT", lineno) - - -# Track the indentation level and emit the right INDENT / DEDENT events. -def indentation_filter(tokens): - # A stack of indentation levels; will never pop item 0 - levels = [0] - token = None - depth = 0 - prev_was_ws = False - for token in tokens: - # if 1: - # print "Process", token, - # if token.at_line_start: - # print "at_line_start", - # if token.must_indent: - # print "must_indent", - # print - - # WS only occurs at the start of the line - # There may be WS followed by NEWLINE so - # only track the depth here. Don't indent/dedent - # until there's something real. 
- if token.type == "WS": - assert depth == 0 - depth = len(token.value) - prev_was_ws = True - # WS tokens are never passed to the parser - continue - - if token.type == "NEWLINE": - depth = 0 - if prev_was_ws or token.at_line_start: - # ignore blank lines - continue - # pass the other cases on through - yield token - continue - - # then it must be a real token (not WS, not NEWLINE) - # which can affect the indentation level - - prev_was_ws = False - if token.must_indent: - # The current depth must be larger than the previous level - if not (depth > levels[-1]): - raise IndentationError("expected an indented block") - - levels.append(depth) - yield INDENT(token.lineno) - - elif token.at_line_start: - # Must be on the same level or one of the previous levels - if depth == levels[-1]: - # At the same level - pass - elif depth > levels[-1]: - raise IndentationError( - "indentation increase but not in new block") - else: - # Back up; but only if it matches a previous level - try: - i = levels.index(depth) - except ValueError: - raise IndentationError("inconsistent indentation") - for _ in range(i + 1, len(levels)): - yield DEDENT(token.lineno) - levels.pop() - - yield token - - ### Finished processing ### - - # Must dedent any remaining levels - if len(levels) > 1: - assert token is not None - for _ in range(1, len(levels)): - yield DEDENT(token.lineno) - - -# The top-level filter adds an ENDMARKER, if requested. -# Python's grammar uses it. -def filter(lexer, add_endmarker=True): - token = None - tokens = iter(lexer.token, None) - tokens = track_tokens_filter(lexer, tokens) - for token in indentation_filter(tokens): - yield token - - if add_endmarker: - lineno = 1 - if token is not None: - lineno = token.lineno - yield _new_token("ENDMARKER", lineno) - -# Combine Ply and my filters into a new lexer - - -class IndentLexer(object): - - def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0): - self.lexer = lex.lex(debug=debug, optimize=optimize, - lextab=lextab, reflags=reflags) - self.token_stream = None - - def input(self, s, add_endmarker=True): - self.lexer.paren_count = 0 - self.lexer.input(s) - self.token_stream = filter(self.lexer, add_endmarker) - - def token(self): - try: - return self.token_stream.next() - except StopIteration: - return None - -########## Parser (tokens -> AST) ###### - -# also part of Ply -#import yacc - -# I use the Python AST -from compiler import ast - -# Helper function - - -def Assign(left, right): - names = [] - if isinstance(left, ast.Name): - # Single assignment on left - return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right) - elif isinstance(left, ast.Tuple): - # List of things - make sure they are Name nodes - names = [] - for child in left.getChildren(): - if not isinstance(child, ast.Name): - raise SyntaxError("that assignment not supported") - names.append(child.name) - ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names] - return ast.Assign([ast.AssTuple(ass_list)], right) - else: - raise SyntaxError("Can't do that yet") - - -# The grammar comments come from Python's Grammar/Grammar file - -# NB: compound_stmt in single_input is followed by extra NEWLINE! 
-# file_input: (NEWLINE | stmt)* ENDMARKER -def p_file_input_end(p): - """file_input_end : file_input ENDMARKER""" - p[0] = ast.Stmt(p[1]) - - -def p_file_input(p): - """file_input : file_input NEWLINE - | file_input stmt - | NEWLINE - | stmt""" - if isinstance(p[len(p) - 1], basestring): - if len(p) == 3: - p[0] = p[1] - else: - p[0] = [] # p == 2 --> only a blank line - else: - if len(p) == 3: - p[0] = p[1] + p[2] - else: - p[0] = p[1] - - -# funcdef: [decorators] 'def' NAME parameters ':' suite -# ignoring decorators -def p_funcdef(p): - "funcdef : DEF NAME parameters COLON suite" - p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5]) - -# parameters: '(' [varargslist] ')' - - -def p_parameters(p): - """parameters : LPAR RPAR - | LPAR varargslist RPAR""" - if len(p) == 3: - p[0] = [] - else: - p[0] = p[2] - - -# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) | -# highly simplified -def p_varargslist(p): - """varargslist : varargslist COMMA NAME - | NAME""" - if len(p) == 4: - p[0] = p[1] + p[3] - else: - p[0] = [p[1]] - -# stmt: simple_stmt | compound_stmt - - -def p_stmt_simple(p): - """stmt : simple_stmt""" - # simple_stmt is a list - p[0] = p[1] - - -def p_stmt_compound(p): - """stmt : compound_stmt""" - p[0] = [p[1]] - -# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE - - -def p_simple_stmt(p): - """simple_stmt : small_stmts NEWLINE - | small_stmts SEMICOLON NEWLINE""" - p[0] = p[1] - - -def p_small_stmts(p): - """small_stmts : small_stmts SEMICOLON small_stmt - | small_stmt""" - if len(p) == 4: - p[0] = p[1] + [p[3]] - else: - p[0] = [p[1]] - -# small_stmt: expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt | -# import_stmt | global_stmt | exec_stmt | assert_stmt - - -def p_small_stmt(p): - """small_stmt : flow_stmt - | expr_stmt""" - p[0] = p[1] - -# expr_stmt: testlist (augassign (yield_expr|testlist) | -# ('=' (yield_expr|testlist))*) -# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | -# '<<=' | '>>=' | '**=' | '//=') - - -def p_expr_stmt(p): - """expr_stmt : testlist ASSIGN testlist - | testlist """ - if len(p) == 2: - # a list of expressions - p[0] = ast.Discard(p[1]) - else: - p[0] = Assign(p[1], p[3]) - - -def p_flow_stmt(p): - "flow_stmt : return_stmt" - p[0] = p[1] - -# return_stmt: 'return' [testlist] - - -def p_return_stmt(p): - "return_stmt : RETURN testlist" - p[0] = ast.Return(p[2]) - - -def p_compound_stmt(p): - """compound_stmt : if_stmt - | funcdef""" - p[0] = p[1] - - -def p_if_stmt(p): - 'if_stmt : IF test COLON suite' - p[0] = ast.If([(p[2], p[4])], None) - - -def p_suite(p): - """suite : simple_stmt - | NEWLINE INDENT stmts DEDENT""" - if len(p) == 2: - p[0] = ast.Stmt(p[1]) - else: - p[0] = ast.Stmt(p[3]) - - -def p_stmts(p): - """stmts : stmts stmt - | stmt""" - if len(p) == 3: - p[0] = p[1] + p[2] - else: - p[0] = p[1] - -# No using Python's approach because Ply supports precedence - -# comparison: expr (comp_op expr)* -# arith_expr: term (('+'|'-') term)* -# term: factor (('*'|'/'|'%'|'//') factor)* -# factor: ('+'|'-'|'~') factor | power -# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' - - -def make_lt_compare((left, right)): - return ast.Compare(left, [('<', right), ]) - - -def make_gt_compare((left, right)): - return ast.Compare(left, [('>', right), ]) - - -def make_eq_compare((left, right)): - return ast.Compare(left, [('==', right), ]) - - -binary_ops = { - "+": ast.Add, - "-": ast.Sub, - "*": ast.Mul, - "/": ast.Div, - "<": make_lt_compare, - ">": 
make_gt_compare, - "==": make_eq_compare, -} -unary_ops = { - "+": ast.UnaryAdd, - "-": ast.UnarySub, -} -precedence = ( - ("left", "EQ", "GT", "LT"), - ("left", "PLUS", "MINUS"), - ("left", "MULT", "DIV"), -) - - -def p_comparison(p): - """comparison : comparison PLUS comparison - | comparison MINUS comparison - | comparison MULT comparison - | comparison DIV comparison - | comparison LT comparison - | comparison EQ comparison - | comparison GT comparison - | PLUS comparison - | MINUS comparison - | power""" - if len(p) == 4: - p[0] = binary_ops[p[2]]((p[1], p[3])) - elif len(p) == 3: - p[0] = unary_ops[p[1]](p[2]) - else: - p[0] = p[1] - -# power: atom trailer* ['**' factor] -# trailers enables function calls. I only allow one level of calls -# so this is 'trailer' - - -def p_power(p): - """power : atom - | atom trailer""" - if len(p) == 2: - p[0] = p[1] - else: - if p[2][0] == "CALL": - p[0] = ast.CallFunc(p[1], p[2][1], None, None) - else: - raise AssertionError("not implemented") - - -def p_atom_name(p): - """atom : NAME""" - p[0] = ast.Name(p[1]) - - -def p_atom_number(p): - """atom : NUMBER - | STRING""" - p[0] = ast.Const(p[1]) - - -def p_atom_tuple(p): - """atom : LPAR testlist RPAR""" - p[0] = p[2] - -# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME - - -def p_trailer(p): - "trailer : LPAR arglist RPAR" - p[0] = ("CALL", p[2]) - -# testlist: test (',' test)* [','] -# Contains shift/reduce error - - -def p_testlist(p): - """testlist : testlist_multi COMMA - | testlist_multi """ - if len(p) == 2: - p[0] = p[1] - else: - # May need to promote singleton to tuple - if isinstance(p[1], list): - p[0] = p[1] - else: - p[0] = [p[1]] - # Convert into a tuple? - if isinstance(p[0], list): - p[0] = ast.Tuple(p[0]) - - -def p_testlist_multi(p): - """testlist_multi : testlist_multi COMMA test - | test""" - if len(p) == 2: - # singleton - p[0] = p[1] - else: - if isinstance(p[1], list): - p[0] = p[1] + [p[3]] - else: - # singleton -> tuple - p[0] = [p[1], p[3]] - - -# test: or_test ['if' or_test 'else' test] | lambdef -# as I don't support 'and', 'or', and 'not' this works down to 'comparison' -def p_test(p): - "test : comparison" - p[0] = p[1] - - -# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test) -# XXX INCOMPLETE: this doesn't allow the trailing comma -def p_arglist(p): - """arglist : arglist COMMA argument - | argument""" - if len(p) == 4: - p[0] = p[1] + [p[3]] - else: - p[0] = [p[1]] - -# argument: test [gen_for] | test '=' test # Really [keyword '='] test - - -def p_argument(p): - "argument : test" - p[0] = p[1] - - -def p_error(p): - # print "Error!", repr(p) - raise SyntaxError(p) - - -class GardenSnakeParser(object): - - def __init__(self, lexer=None): - if lexer is None: - lexer = IndentLexer() - self.lexer = lexer - self.parser = yacc.yacc(start="file_input_end") - - def parse(self, code): - self.lexer.input(code) - result = self.parser.parse(lexer=self.lexer) - return ast.Module(None, result) - - -###### Code generation ###### - -from compiler import misc, syntax, pycodegen - - -class GardenSnakeCompiler(object): - - def __init__(self): - self.parser = GardenSnakeParser() - - def compile(self, code, filename=""): - tree = self.parser.parse(code) - # print tree - misc.set_filename(filename, tree) - syntax.check(tree) - gen = pycodegen.ModuleCodeGenerator(tree) - code = gen.getCode() - return code - -####### Test code ####### - -compile = GardenSnakeCompiler().compile - -code = r""" - -print('LET\'S TRY THIS \\OUT') - -#Comment here -def 
x(a): - print('called with',a) - if a == 1: - return 2 - if a*2 > 10: return 999 / 4 - # Another comment here - - return a+2*3 - -ints = (1, 2, - 3, 4, -5) -print('mutiline-expression', ints) - -t = 4+1/3*2+6*(9-5+1) -print('predence test; should be 34+2/3:', t, t==(34+2/3)) - -print('numbers', 1,2,3,4,5) -if 1: - 8 - a=9 - print(x(a)) - -print(x(1)) -print(x(2)) -print(x(8),'3') -print('this is decimal', 1/5) -print('BIG DECIMAL', 1.234567891234567e12345) - -""" - -# Set up the GardenSnake run-time environment - - -def print_(*args): - print "-->", " ".join(map(str, args)) - -globals()["print"] = print_ - -compiled_code = compile(code) - -exec compiled_code in globals() -print "Done" diff --git a/example/GardenSnake/README b/example/GardenSnake/README deleted file mode 100644 index 4d8be2d..0000000 --- a/example/GardenSnake/README +++ /dev/null @@ -1,5 +0,0 @@ -This example is Andrew Dalke's GardenSnake language. It shows how to process an -indentation-like language like Python. Further details can be found here: - -http://dalkescientific.com/writings/diary/archive/2006/08/30/gardensnake_language.html - diff --git a/example/README b/example/README index 63519b5..a7ec6e8 100644 --- a/example/README +++ b/example/README @@ -5,6 +5,5 @@ Simple examples: Complex examples ansic - ANSI C grammar from K&R BASIC - A small BASIC interpreter - GardenSnake - A simple python-like language yply - Converts Unix yacc files to PLY programs. diff --git a/example/calc/calc.py b/example/calc/calc.py index 824c3d7..406d83c 100644 --- a/example/calc/calc.py +++ b/example/calc/calc.py @@ -8,9 +8,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - tokens = ( 'NAME', 'NUMBER', ) @@ -29,19 +26,17 @@ def t_NUMBER(t): t_ignore = " \t" - def t_newline(t): r'\n+' t.lexer.lineno += t.value.count("\n") - def t_error(t): print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) # Build the lexer import ply.lex as lex -lex.lex() +lexer = lex.lex() # Parsing rules @@ -54,7 +49,6 @@ def t_error(t): # dictionary of names names = {} - def p_statement_assign(p): 'statement : NAME "=" expression' names[p[1]] = p[3] @@ -111,11 +105,11 @@ def p_error(p): print("Syntax error at EOF") import ply.yacc as yacc -yacc.yacc() +parser = yacc.yacc() -while 1: +while True: try: - s = raw_input('calc > ') + s = input('calc > ') except EOFError: break if not s: diff --git a/example/calcdebug/calc.py b/example/calcdebug/calc.py index 06831e2..386000e 100644 --- a/example/calcdebug/calc.py +++ b/example/calcdebug/calc.py @@ -8,9 +8,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - tokens = ( 'NAME', 'NUMBER', ) @@ -119,9 +116,9 @@ def p_error(p): filename="parselog.txt" ) -while 1: +while True: try: - s = raw_input('calc > ') + s = input('calc > ') except EOFError: break if not s: diff --git a/example/calceof/calc.py b/example/calceof/calc.py index 22b39a4..7bb7e0f 100644 --- a/example/calceof/calc.py +++ b/example/calceof/calc.py @@ -8,9 +8,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - tokens = ( 'NAME', 'NUMBER', ) @@ -36,7 +33,7 @@ def t_newline(t): def t_eof(t): - more = raw_input('... ') + more = input('... 
') if more: t.lexer.input(more + '\n') return t.lexer.token() @@ -122,9 +119,9 @@ def p_error(p): import ply.yacc as yacc yacc.yacc() -while 1: +while True: try: - s = raw_input('calc > ') + s = input('calc > ') except EOFError: break if not s: diff --git a/example/classcalc/calc.py b/example/classcalc/calc.py index ada4afd..6f35195 100755 --- a/example/classcalc/calc.py +++ b/example/classcalc/calc.py @@ -12,9 +12,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - import ply.lex as lex import ply.yacc as yacc import os @@ -36,20 +33,18 @@ def __init__(self, **kw): except: modname = "parser" + "_" + self.__class__.__name__ self.debugfile = modname + ".dbg" - self.tabmodule = modname + "_" + "parsetab" - # print self.debugfile, self.tabmodule + # print self.debugfile # Build the lexer and parser lex.lex(module=self, debug=self.debug) yacc.yacc(module=self, debug=self.debug, - debugfile=self.debugfile, - tabmodule=self.tabmodule) + debugfile=self.debugfile) def run(self): - while 1: + while True: try: - s = raw_input('calc > ') + s = input('calc > ') except EOFError: break if not s: diff --git a/example/closurecalc/calc.py b/example/closurecalc/calc.py index 6031b05..59c9d6f 100644 --- a/example/closurecalc/calc.py +++ b/example/closurecalc/calc.py @@ -9,9 +9,6 @@ import sys sys.path.insert(0, "../..") -if sys.version_info[0] >= 3: - raw_input = input - # Make a calculator function @@ -124,7 +121,7 @@ def input(text): while True: try: - s = raw_input("calc > ") + s = input("calc > ") except EOFError: break r = calc(s) diff --git a/example/hedit/hedit.py b/example/hedit/hedit.py deleted file mode 100644 index 32da745..0000000 --- a/example/hedit/hedit.py +++ /dev/null @@ -1,48 +0,0 @@ -# ----------------------------------------------------------------------------- -# hedit.py -# -# Paring of Fortran H Edit descriptions (Contributed by Pearu Peterson) -# -# These tokens can't be easily tokenized because they are of the following -# form: -# -# nHc1...cn -# -# where n is a positive integer and c1 ... cn are characters. -# -# This example shows how to modify the state of the lexer to parse -# such tokens -# ----------------------------------------------------------------------------- - -import sys -sys.path.insert(0, "../..") - - -tokens = ( - 'H_EDIT_DESCRIPTOR', -) - -# Tokens -t_ignore = " \t\n" - - -def t_H_EDIT_DESCRIPTOR(t): - r"\d+H.*" # This grabs all of the remaining text - i = t.value.index('H') - n = eval(t.value[:i]) - - # Adjust the tokenizing position - t.lexer.lexpos -= len(t.value) - (i + 1 + n) - - t.value = t.value[i + 1:i + 1 + n] - return t - - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -import ply.lex as lex -lex.lex() -lex.runmain() diff --git a/example/newclasscalc/calc.py b/example/newclasscalc/calc.py deleted file mode 100755 index 43c9506..0000000 --- a/example/newclasscalc/calc.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env python - -# ----------------------------------------------------------------------------- -# calc.py -# -# A simple calculator with variables. This is from O'Reilly's -# "Lex and Yacc", p. 63. -# -# Class-based example contributed to PLY by David McNab. -# -# Modified to use new-style classes. Test case. 
-# ----------------------------------------------------------------------------- - -import sys -sys.path.insert(0, "../..") - -if sys.version_info[0] >= 3: - raw_input = input - -import ply.lex as lex -import ply.yacc as yacc -import os - - -class Parser(object): - """ - Base class for a lexer/parser that has the rules defined as methods - """ - tokens = () - precedence = () - - def __init__(self, **kw): - self.debug = kw.get('debug', 0) - self.names = {} - try: - modname = os.path.split(os.path.splitext(__file__)[0])[ - 1] + "_" + self.__class__.__name__ - except: - modname = "parser" + "_" + self.__class__.__name__ - self.debugfile = modname + ".dbg" - self.tabmodule = modname + "_" + "parsetab" - # print self.debugfile, self.tabmodule - - # Build the lexer and parser - lex.lex(module=self, debug=self.debug) - yacc.yacc(module=self, - debug=self.debug, - debugfile=self.debugfile, - tabmodule=self.tabmodule) - - def run(self): - while 1: - try: - s = raw_input('calc > ') - except EOFError: - break - if not s: - continue - yacc.parse(s) - - -class Calc(Parser): - - tokens = ( - 'NAME', 'NUMBER', - 'PLUS', 'MINUS', 'EXP', 'TIMES', 'DIVIDE', 'EQUALS', - 'LPAREN', 'RPAREN', - ) - - # Tokens - - t_PLUS = r'\+' - t_MINUS = r'-' - t_EXP = r'\*\*' - t_TIMES = r'\*' - t_DIVIDE = r'/' - t_EQUALS = r'=' - t_LPAREN = r'\(' - t_RPAREN = r'\)' - t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - - def t_NUMBER(self, t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - # print "parsed number %s" % repr(t.value) - return t - - t_ignore = " \t" - - def t_newline(self, t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - - def t_error(self, t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - - # Parsing rules - - precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('left', 'EXP'), - ('right', 'UMINUS'), - ) - - def p_statement_assign(self, p): - 'statement : NAME EQUALS expression' - self.names[p[1]] = p[3] - - def p_statement_expr(self, p): - 'statement : expression' - print(p[1]) - - def p_expression_binop(self, p): - """ - expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression - | expression EXP expression - """ - # print [repr(p[i]) for i in range(0,4)] - if p[2] == '+': - p[0] = p[1] + p[3] - elif p[2] == '-': - p[0] = p[1] - p[3] - elif p[2] == '*': - p[0] = p[1] * p[3] - elif p[2] == '/': - p[0] = p[1] / p[3] - elif p[2] == '**': - p[0] = p[1] ** p[3] - - def p_expression_uminus(self, p): - 'expression : MINUS expression %prec UMINUS' - p[0] = -p[2] - - def p_expression_group(self, p): - 'expression : LPAREN expression RPAREN' - p[0] = p[2] - - def p_expression_number(self, p): - 'expression : NUMBER' - p[0] = p[1] - - def p_expression_name(self, p): - 'expression : NAME' - try: - p[0] = self.names[p[1]] - except LookupError: - print("Undefined name '%s'" % p[1]) - p[0] = 0 - - def p_error(self, p): - if p: - print("Syntax error at '%s'" % p.value) - else: - print("Syntax error at EOF") - -if __name__ == '__main__': - calc = Calc() - calc.run() diff --git a/example/optcalc/README b/example/optcalc/README deleted file mode 100644 index 53dd5fc..0000000 --- a/example/optcalc/README +++ /dev/null @@ -1,9 +0,0 @@ -An example showing how to use Python optimized mode. -To run: - - - First run 'python calc.py' - - - Then run 'python -OO calc.py' - -If working correctly, the second version should run the -same way. 
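The optcalc example removed below exercised lex.lex(optimize=1) and yacc.yacc(optimize=1); with the optimize/lextab options and the table-writing machinery dropped from ply/lex.py (and the corresponding defaults from ply/yacc.py) later in this patch, there is no separate optimized mode left to demonstrate. What follows is only a minimal sketch of the build calls that remain, assuming the simplified keyword-only lex() signature shown in the ply/lex.py hunks and the yacc.yacc(debug=False) style used in the yply example; the token and grammar rules are illustrative placeholders, not code from this patch.

# Minimal sketch of building a lexer/parser with the simplified API.
# The tiny grammar below is a placeholder used only for illustration.
import ply.lex as lex
import ply.yacc as yacc

tokens = ('NUMBER', 'PLUS')

t_PLUS = r'\+'
t_ignore = ' \t'

def t_NUMBER(t):
    r'\d+'
    t.value = int(t.value)
    return t

def t_error(t):
    print(f"Illegal character {t.value[0]!r}")
    t.lexer.skip(1)

def p_expr_plus(p):
    'expr : expr PLUS term'
    p[0] = p[1] + p[3]

def p_expr_term(p):
    'expr : term'
    p[0] = p[1]

def p_term_number(p):
    'term : NUMBER'
    p[0] = p[1]

def p_error(p):
    print("Syntax error")

# No optimize=, lextab=, or tabmodule= arguments: the tables are rebuilt
# in memory on each run and the returned objects are used directly.
lexer = lex.lex()
parser = yacc.yacc(debug=False)

print(parser.parse('1+2+3', lexer=lexer))   # -> 6
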
diff --git a/example/optcalc/calc.py b/example/optcalc/calc.py deleted file mode 100644 index 0c223e5..0000000 --- a/example/optcalc/calc.py +++ /dev/null @@ -1,134 +0,0 @@ -# ----------------------------------------------------------------------------- -# calc.py -# -# A simple calculator with variables. This is from O'Reilly's -# "Lex and Yacc", p. 63. -# ----------------------------------------------------------------------------- - -import sys -sys.path.insert(0, "../..") - -if sys.version_info[0] >= 3: - raw_input = input - -tokens = ( - 'NAME', 'NUMBER', - 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'EQUALS', - 'LPAREN', 'RPAREN', -) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -import ply.lex as lex -lex.lex(optimize=1) - -# Parsing rules - -precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('right', 'UMINUS'), -) - -# dictionary of names -names = {} - - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - - -def p_statement_expr(t): - 'statement : expression' - print(t[1]) - - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+': - t[0] = t[1] + t[3] - elif t[2] == '-': - t[0] = t[1] - t[3] - elif t[2] == '*': - t[0] = t[1] * t[3] - elif t[2] == '/': - t[0] = t[1] / t[3] - elif t[2] == '<': - t[0] = t[1] < t[3] - - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - - -def p_error(t): - if t: - print("Syntax error at '%s'" % t.value) - else: - print("Syntax error at EOF") - -import ply.yacc as yacc -yacc.yacc(optimize=1) - -while 1: - try: - s = raw_input('calc > ') - except EOFError: - break - yacc.parse(s) diff --git a/example/unicalc/calc.py b/example/unicalc/calc.py deleted file mode 100644 index 901c4b9..0000000 --- a/example/unicalc/calc.py +++ /dev/null @@ -1,133 +0,0 @@ -# ----------------------------------------------------------------------------- -# calc.py -# -# A simple calculator with variables. This is from O'Reilly's -# "Lex and Yacc", p. 63. -# -# This example uses unicode strings for tokens, docstrings, and input. 
-# ----------------------------------------------------------------------------- - -import sys -sys.path.insert(0, "../..") - -tokens = ( - 'NAME', 'NUMBER', - 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'EQUALS', - 'LPAREN', 'RPAREN', -) - -# Tokens - -t_PLUS = ur'\+' -t_MINUS = ur'-' -t_TIMES = ur'\*' -t_DIVIDE = ur'/' -t_EQUALS = ur'=' -t_LPAREN = ur'\(' -t_RPAREN = ur'\)' -t_NAME = ur'[a-zA-Z_][a-zA-Z0-9_]*' - - -def t_NUMBER(t): - ur'\d+' - try: - t.value = int(t.value) - except ValueError: - print "Integer value too large", t.value - t.value = 0 - return t - -t_ignore = u" \t" - - -def t_newline(t): - ur'\n+' - t.lexer.lineno += t.value.count("\n") - - -def t_error(t): - print "Illegal character '%s'" % t.value[0] - t.lexer.skip(1) - -# Build the lexer -import ply.lex as lex -lex.lex() - -# Parsing rules - -precedence = ( - ('left', 'PLUS', 'MINUS'), - ('left', 'TIMES', 'DIVIDE'), - ('right', 'UMINUS'), -) - -# dictionary of names -names = {} - - -def p_statement_assign(p): - 'statement : NAME EQUALS expression' - names[p[1]] = p[3] - - -def p_statement_expr(p): - 'statement : expression' - print p[1] - - -def p_expression_binop(p): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if p[2] == u'+': - p[0] = p[1] + p[3] - elif p[2] == u'-': - p[0] = p[1] - p[3] - elif p[2] == u'*': - p[0] = p[1] * p[3] - elif p[2] == u'/': - p[0] = p[1] / p[3] - - -def p_expression_uminus(p): - 'expression : MINUS expression %prec UMINUS' - p[0] = -p[2] - - -def p_expression_group(p): - 'expression : LPAREN expression RPAREN' - p[0] = p[2] - - -def p_expression_number(p): - 'expression : NUMBER' - p[0] = p[1] - - -def p_expression_name(p): - 'expression : NAME' - try: - p[0] = names[p[1]] - except LookupError: - print "Undefined name '%s'" % p[1] - p[0] = 0 - - -def p_error(p): - if p: - print "Syntax error at '%s'" % p.value - else: - print "Syntax error at EOF" - -import ply.yacc as yacc -yacc.yacc() - -while 1: - try: - s = raw_input('calc > ') - except EOFError: - break - if not s: - continue - yacc.parse(unicode(s)) diff --git a/example/yply/yparse.py b/example/yply/yparse.py index 1f2e8d0..b2c8863 100644 --- a/example/yply/yparse.py +++ b/example/yply/yparse.py @@ -233,7 +233,7 @@ def p_empty(p): def p_error(p): pass -yacc.yacc(debug=0) +yacc.yacc(debug=False) def print_code(code, indent): diff --git a/ply/__init__.py b/ply/__init__.py index 6f768b7..8783862 100644 --- a/ply/__init__.py +++ b/ply/__init__.py @@ -1,5 +1,6 @@ # PLY package # Author: David Beazley (dave@dabeaz.com) +# https://dabeaz.com/ply/index.html __version__ = '4.0' __all__ = ['lex','yacc'] diff --git a/ply/lex.py b/ply/lex.py index 39095eb..3b670ef 100644 --- a/ply/lex.py +++ b/ply/lex.py @@ -33,9 +33,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ----------------------------------------------------------------------------- -__version__ = '4.0' -__tabversion__ = '3.10' - import re import sys import types @@ -56,15 +53,10 @@ def __init__(self, message, s): self.args = (message,) self.text = s - # Token class. This class is used to represent the tokens produced. class LexToken(object): - def __str__(self): - return 'LexToken(%s,%r,%d,%d)' % (self.type, self.value, self.lineno, self.lexpos) - def __repr__(self): - return str(self) - + return f'LexToken({self.type},{self.value!r},{self.lineno},{self.lexpos})' # This object is a stand-in for a logging object created by the # logging module. 
@@ -85,16 +77,6 @@ def error(self, msg, *args, **kwargs): info = critical debug = critical - -# Null logger is used when no output is generated. Does nothing. -class NullLogger(object): - def __getattribute__(self, name): - return self - - def __call__(self, *args, **kwargs): - return self - - # ----------------------------------------------------------------------------- # === Lexing Engine === # @@ -136,7 +118,6 @@ def __init__(self): self.lexliterals = '' # Literal characters that can be passed through self.lexmodule = None # Module self.lineno = 1 # Current line number - self.lexoptimize = False # Optimized mode def clone(self, object=None): c = copy.copy(self) @@ -165,91 +146,10 @@ def clone(self, object=None): c.lexmodule = object return c - # ------------------------------------------------------------ - # writetab() - Write lexer information to a table file - # ------------------------------------------------------------ - def writetab(self, lextab, outputdir=''): - if isinstance(lextab, types.ModuleType): - raise IOError("Won't overwrite existing lextab module") - basetabmodule = lextab.split('.')[-1] - filename = os.path.join(outputdir, basetabmodule) + '.py' - with open(filename, 'w') as tf: - tf.write('# %s.py. This file automatically created by PLY (version %s). Don\'t edit!\n' % (basetabmodule, __version__)) - tf.write('_tabversion = %s\n' % repr(__tabversion__)) - tf.write('_lextokens = set(%s)\n' % repr(tuple(sorted(self.lextokens)))) - tf.write('_lexreflags = %s\n' % repr(int(self.lexreflags))) - tf.write('_lexliterals = %s\n' % repr(self.lexliterals)) - tf.write('_lexstateinfo = %s\n' % repr(self.lexstateinfo)) - - # Rewrite the lexstatere table, replacing function objects with function names - tabre = {} - for statename, lre in self.lexstatere.items(): - titem = [] - for (pat, func), retext, renames in zip(lre, self.lexstateretext[statename], self.lexstaterenames[statename]): - titem.append((retext, _funcs_to_names(func, renames))) - tabre[statename] = titem - - tf.write('_lexstatere = %s\n' % repr(tabre)) - tf.write('_lexstateignore = %s\n' % repr(self.lexstateignore)) - - taberr = {} - for statename, ef in self.lexstateerrorf.items(): - taberr[statename] = ef.__name__ if ef else None - tf.write('_lexstateerrorf = %s\n' % repr(taberr)) - - tabeof = {} - for statename, ef in self.lexstateeoff.items(): - tabeof[statename] = ef.__name__ if ef else None - tf.write('_lexstateeoff = %s\n' % repr(tabeof)) - - # ------------------------------------------------------------ - # readtab() - Read lexer information from a tab file - # ------------------------------------------------------------ - def readtab(self, tabfile, fdict): - if isinstance(tabfile, types.ModuleType): - lextab = tabfile - else: - exec('import %s' % tabfile) - lextab = sys.modules[tabfile] - - if getattr(lextab, '_tabversion', '0.0') != __tabversion__: - raise ImportError('Inconsistent PLY version') - - self.lextokens = lextab._lextokens - self.lexreflags = lextab._lexreflags - self.lexliterals = lextab._lexliterals - self.lextokens_all = self.lextokens | set(self.lexliterals) - self.lexstateinfo = lextab._lexstateinfo - self.lexstateignore = lextab._lexstateignore - self.lexstatere = {} - self.lexstateretext = {} - for statename, lre in lextab._lexstatere.items(): - titem = [] - txtitem = [] - for pat, func_name in lre: - titem.append((re.compile(pat, lextab._lexreflags), _names_to_funcs(func_name, fdict))) - - self.lexstatere[statename] = titem - self.lexstateretext[statename] = txtitem - - 
self.lexstateerrorf = {} - for statename, ef in lextab._lexstateerrorf.items(): - self.lexstateerrorf[statename] = fdict[ef] - - self.lexstateeoff = {} - for statename, ef in lextab._lexstateeoff.items(): - self.lexstateeoff[statename] = fdict[ef] - - self.begin('INITIAL') - # ------------------------------------------------------------ # input() - Push a new string into the lexer # ------------------------------------------------------------ def input(self, s): - # Pull off the first character to see if s looks like a string - c = s[:1] - if not isinstance(c, StringTypes): - raise ValueError('Expected a string') self.lexdata = s self.lexpos = 0 self.lexlen = len(s) @@ -259,7 +159,7 @@ def input(self, s): # ------------------------------------------------------------ def begin(self, state): if state not in self.lexstatere: - raise ValueError('Undefined state') + raise ValueError(f'Undefined state {state!r}') self.lexre = self.lexstatere[state] self.lexretext = self.lexstateretext[state] self.lexignore = self.lexstateignore.get(state, '') @@ -293,7 +193,7 @@ def skip(self, n): self.lexpos += n # ------------------------------------------------------------ - # opttoken() - Return the next token from the Lexer + # token() - Return the next token from the Lexer # # Note: This function has been carefully implemented to be as fast # as possible. Don't make changes unless you really know what @@ -343,22 +243,15 @@ def token(self): tok.lexer = self # Set additional attributes useful in token rules self.lexmatch = m self.lexpos = lexpos - newtok = func(tok) + del tok.lexer + del self.lexmatch # Every function must return a token, if nothing, we just move to next token if not newtok: lexpos = self.lexpos # This is here in case user has updated lexpos. lexignore = self.lexignore # This is here in case there was a state change break - - # Verify type of the token. If not in the token map, raise an error - if not self.lexoptimize: - if newtok.type not in self.lextokens_all: - raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( - func.__code__.co_filename, func.__code__.co_firstlineno, - func.__name__, newtok.type), lexdata[lexpos:]) - return newtok else: # No match, see if in literals @@ -383,14 +276,16 @@ def token(self): newtok = self.lexerrorf(tok) if lexpos == self.lexpos: # Error method didn't change text position at all. This is an error. - raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) + raise LexError(f"Scanning error. 
Illegal character {lexdata[lexpos]!r}", + lexdata[lexpos:]) lexpos = self.lexpos if not newtok: continue return newtok self.lexpos = lexpos - raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:]) + raise LexError(f"Illegal character {lexdata[lexpos]!r} at index {lexpos}", + lexdata[lexpos:]) if self.lexeoff: tok = LexToken() @@ -412,14 +307,12 @@ def token(self): def __iter__(self): return self - def next(self): + def __next__(self): t = self.token() if t is None: raise StopIteration return t - __next__ = next - # ----------------------------------------------------------------------------- # ==== Lex Builder === # @@ -445,40 +338,7 @@ def _get_regex(func): # ----------------------------------------------------------------------------- def get_caller_module_dict(levels): f = sys._getframe(levels) - ldict = f.f_globals.copy() - if f.f_globals != f.f_locals: - ldict.update(f.f_locals) - return ldict - -# ----------------------------------------------------------------------------- -# _funcs_to_names() -# -# Given a list of regular expression functions, this converts it to a list -# suitable for output to a table file -# ----------------------------------------------------------------------------- -def _funcs_to_names(funclist, namelist): - result = [] - for f, name in zip(funclist, namelist): - if f and f[0]: - result.append((name, f[1])) - else: - result.append(f) - return result - -# ----------------------------------------------------------------------------- -# _names_to_funcs() -# -# Given a list of regular expression function names, this converts it back to -# functions. -# ----------------------------------------------------------------------------- -def _names_to_funcs(namelist, fdict): - result = [] - for n in namelist: - if n and n[0]: - result.append((fdict[n[0]], n[1])) - else: - result.append(n) - return result + return { **f.f_globals, **f.f_locals } # ----------------------------------------------------------------------------- # _form_master_re() @@ -489,7 +349,7 @@ def _names_to_funcs(namelist, fdict): # ----------------------------------------------------------------------------- def _form_master_re(relist, reflags, ldict, toknames): if not relist: - return [] + return [], [], [] regex = '|'.join(relist) try: lexre = re.compile(regex, reflags) @@ -512,9 +372,7 @@ def _form_master_re(relist, reflags, ldict, toknames): return [(lexre, lexindexfunc)], [regex], [lexindexnames] except Exception: - m = int(len(relist)/2) - if m == 0: - m = 1 + m = (len(relist) // 2) + 1 llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames) rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames) return (llist+rlist), (lre+rre), (lnames+rnames) @@ -601,10 +459,10 @@ def validate_tokens(self): terminals = {} for n in self.tokens: if not _is_identifier.match(n): - self.log.error("Bad token name '%s'", n) + self.log.error(f"Bad token name {n!r}") self.error = True if n in terminals: - self.log.warning("Token '%s' multiply defined", n) + self.log.warning(f"Token {n!r} multiply defined") terminals[n] = 1 # Get the literals specifier @@ -618,7 +476,7 @@ def validate_literals(self): try: for c in self.literals: if not isinstance(c, StringTypes) or len(c) > 1: - self.log.error('Invalid literal %s. Must be a single character', repr(c)) + self.log.error(f'Invalid literal {c!r}. 
Must be a single character') self.error = True except TypeError: @@ -635,20 +493,20 @@ def get_states(self): else: for s in self.states: if not isinstance(s, tuple) or len(s) != 2: - self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s)) + self.log.error("Invalid state specifier %r. Must be a tuple (statename,'exclusive|inclusive')", s) self.error = True continue name, statetype = s if not isinstance(name, StringTypes): - self.log.error('State name %s must be a string', repr(name)) + self.log.error('State name %r must be a string', name) self.error = True continue if not (statetype == 'inclusive' or statetype == 'exclusive'): - self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name) + self.log.error("State type for state %r must be 'inclusive' or 'exclusive'", name) self.error = True continue if name in self.stateinfo: - self.log.error("State '%s' already defined", name) + self.log.error("State %r already defined", name) self.error = True continue self.stateinfo[name] = statetype @@ -691,7 +549,7 @@ def get_rules(self): elif tokname == 'ignore': line = t.__code__.co_firstlineno file = t.__code__.co_filename - self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__) + self.log.error("%s:%d: Rule %r must be defined as a string", file, line, t.__name__) self.error = True else: for s in states: @@ -704,7 +562,7 @@ def get_rules(self): self.log.warning("%s contains a literal backslash '\\'", f) elif tokname == 'error': - self.log.error("Rule '%s' must be defined as a function", f) + self.log.error("Rule %r must be defined as a function", f) self.error = True else: for s in states: @@ -739,57 +597,57 @@ def validate_rules(self): reqargs = 1 nargs = f.__code__.co_argcount if nargs > reqargs: - self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__) + self.log.error("%s:%d: Rule %r has too many arguments", file, line, f.__name__) self.error = True continue if nargs < reqargs: - self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__) + self.log.error("%s:%d: Rule %r requires an argument", file, line, f.__name__) self.error = True continue if not _get_regex(f): - self.log.error("%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__) + self.log.error("%s:%d: No regular expression defined for rule %r", file, line, f.__name__) self.error = True continue try: c = re.compile('(?P<%s>%s)' % (fname, _get_regex(f)), self.reflags) if c.match(''): - self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__) + self.log.error("%s:%d: Regular expression for rule %r matches empty string", file, line, f.__name__) self.error = True except re.error as e: self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e) if '#' in _get_regex(f): - self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__) + self.log.error("%s:%d. 
Make sure '#' in rule %r is escaped with '\\#'", file, line, f.__name__) self.error = True # Validate all rules defined by strings for name, r in self.strsym[state]: tokname = self.toknames[name] if tokname == 'error': - self.log.error("Rule '%s' must be defined as a function", name) + self.log.error("Rule %r must be defined as a function", name) self.error = True continue if tokname not in self.tokens and tokname.find('ignore_') < 0: - self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname) + self.log.error("Rule %r defined for an unspecified token %s", name, tokname) self.error = True continue try: c = re.compile('(?P<%s>%s)' % (name, r), self.reflags) if (c.match('')): - self.log.error("Regular expression for rule '%s' matches empty string", name) + self.log.error("Regular expression for rule %r matches empty string", name) self.error = True except re.error as e: - self.log.error("Invalid regular expression for rule '%s'. %s", name, e) + self.log.error("Invalid regular expression for rule %r. %s", name, e) if '#' in r: - self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name) + self.log.error("Make sure '#' in rule %r is escaped with '\\#'", name) self.error = True if not self.funcsym[state] and not self.strsym[state]: - self.log.error("No rules defined for state '%s'", state) + self.log.error("No rules defined for state %r", state) self.error = True # Validate the error function @@ -807,11 +665,11 @@ def validate_rules(self): reqargs = 1 nargs = f.__code__.co_argcount if nargs > reqargs: - self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__) + self.log.error("%s:%d: Rule %r has too many arguments", file, line, f.__name__) self.error = True if nargs < reqargs: - self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__) + self.log.error("%s:%d: Rule %r requires an argument", file, line, f.__name__) self.error = True for module in self.modules: @@ -856,18 +714,14 @@ def validate_module(self, module): # # Build all of the regular expression rules from definitions in the supplied module # ----------------------------------------------------------------------------- -def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab', - reflags=int(re.VERBOSE), nowarn=False, outputdir=None, debuglog=None, errorlog=None): - - if lextab is None: - lextab = 'lextab' +def lex(*, module=None, object=None, debug=False, + reflags=int(re.VERBOSE), debuglog=None, errorlog=None): global lexer ldict = None stateinfo = {'INITIAL': 'inclusive'} lexobj = Lexer() - lexobj.lexoptimize = optimize global token, input if errorlog is None: @@ -891,30 +745,11 @@ def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab', else: ldict = get_caller_module_dict(2) - # Determine if the module is package of a package or not. - # If so, fix the tabmodule setting so that tables load correctly - pkg = ldict.get('__package__') - if pkg and isinstance(lextab, str): - if '.' not in lextab: - lextab = pkg + '.' 
+ lextab - # Collect parser information from the dictionary linfo = LexerReflect(ldict, log=errorlog, reflags=reflags) linfo.get_all() - if not optimize: - if linfo.validate_all(): - raise SyntaxError("Can't build lexer") - - if optimize and lextab: - try: - lexobj.readtab(lextab, ldict) - token = lexobj.token - input = lexobj.input - lexer = lexobj - return lexobj - - except ImportError: - pass + if linfo.validate_all(): + raise SyntaxError("Can't build lexer") # Dump some basic debugging information if debug: @@ -1001,9 +836,9 @@ def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab', for s, stype in stateinfo.items(): if stype == 'exclusive': if s not in linfo.errorf: - errorlog.warning("No error rule is defined for exclusive state '%s'", s) + errorlog.warning("No error rule is defined for exclusive state %r", s) if s not in linfo.ignore and lexobj.lexignore: - errorlog.warning("No ignore rule is defined for exclusive state '%s'", s) + errorlog.warning("No ignore rule is defined for exclusive state %r", s) elif stype == 'inclusive': if s not in linfo.errorf: linfo.errorf[s] = linfo.errorf.get('INITIAL', None) @@ -1015,31 +850,6 @@ def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab', input = lexobj.input lexer = lexobj - # If in optimize mode, we write the lextab - if lextab and optimize: - if outputdir is None: - # If no output directory is set, the location of the output files - # is determined according to the following rules: - # - If lextab specifies a package, files go into that package directory - # - Otherwise, files go in the same directory as the specifying module - if isinstance(lextab, types.ModuleType): - srcfile = lextab.__file__ - else: - if '.' not in lextab: - srcfile = ldict['__file__'] - else: - parts = lextab.split('.') - pkgname = '.'.join(parts[:-1]) - exec('import %s' % pkgname) - srcfile = getattr(sys.modules[pkgname], '__file__', '') - outputdir = os.path.dirname(srcfile) - try: - lexobj.writetab(lextab, outputdir) - if lextab in sys.modules: - del sys.modules[lextab] - except IOError as e: - errorlog.warning("Couldn't write lextab module %r. %s" % (lextab, e)) - return lexobj # ----------------------------------------------------------------------------- @@ -1072,7 +882,7 @@ def runmain(lexer=None, data=None): tok = _token() if not tok: break - sys.stdout.write('(%s,%r,%d,%d)\n' % (tok.type, tok.value, tok.lineno, tok.lexpos)) + sys.stdout.write(f'({tok.type},{tok.value!r},{tok.lineno},{tok.lexpos})\n') # ----------------------------------------------------------------------------- # @TOKEN(regex) @@ -1089,6 +899,3 @@ def set_regex(f): f.regex = r return f return set_regex - -# Alternative spelling of the TOKEN decorator -Token = TOKEN diff --git a/ply/yacc.py b/ply/yacc.py index a5024eb..5a750d7 100644 --- a/ply/yacc.py +++ b/ply/yacc.py @@ -64,12 +64,7 @@ import re import types import sys -import os.path import inspect -import warnings - -__version__ = '4.0' -__tabversion__ = '3.10' #----------------------------------------------------------------------------- # === User configurable parameters === @@ -77,22 +72,13 @@ # Change these to modify the default behavior of yacc (if you wish) #----------------------------------------------------------------------------- -yaccdebug = True # Debugging mode. If set, yacc generates a +yaccdebug = False # Debugging mode. 
If set, yacc generates a # a 'parser.out' file in the current directory debug_file = 'parser.out' # Default name of the debugging file -tab_module = 'parsetab' # Default name of the table module -default_lr = 'LALR' # Default LR table generation method - error_count = 3 # Number of symbols that must be shifted to leave recovery mode - -yaccdevel = False # Set to True if developing yacc. This turns off optimized - # implementations of certain functions. - resultlimit = 40 # Size limit of results when running in debug mode. -pickle_protocol = 0 # Protocol to use when writing pickle files - MAXINT = sys.maxsize # This object is a stand-in for a logging object created by the @@ -150,48 +136,6 @@ def format_stack_entry(r): else: return '<%s @ 0x%x>' % (type(r).__name__, id(r)) -# Panic mode error recovery support. This feature is being reworked--much of the -# code here is to offer a deprecation/backwards compatible transition - -_errok = None -_token = None -_restart = None -_warnmsg = '''PLY: Don't use global functions errok(), token(), and restart() in p_error(). -Instead, invoke the methods on the associated parser instance: - - def p_error(p): - ... - # Use parser.errok(), parser.token(), parser.restart() - ... - - parser = yacc.yacc() -''' - -def errok(): - warnings.warn(_warnmsg) - return _errok() - -def restart(): - warnings.warn(_warnmsg) - return _restart() - -def token(): - warnings.warn(_warnmsg) - return _token() - -# Utility function to call the p_error() function with some deprecation hacks -def call_errorfunc(errorfunc, token, parser): - global _errok, _token, _restart - _errok = parser.errok - _token = parser.token - _restart = parser.restart - r = errorfunc(token) - try: - del _errok, _token, _restart - except NameError: - pass - return r - #----------------------------------------------------------------------------- # === LR Parsing Engine === # @@ -218,786 +162,119 @@ def __repr__(self): # This class is a wrapper around the objects actually passed to each # grammar rule. Index lookup and assignment actually assign the -# .value attribute of the underlying YaccSymbol object. -# The lineno() method returns the line number of a given -# item (or 0 if not defined). The linespan() method returns -# a tuple of (startline,endline) representing the range of lines -# for a symbol. The lexspan() method returns a tuple (lexpos,endlexpos) -# representing the range of positional information for a symbol. 
- -class YaccProduction: - def __init__(self, s, stack=None): - self.slice = s - self.stack = stack - self.lexer = None - self.parser = None - - def __getitem__(self, n): - if isinstance(n, slice): - return [s.value for s in self.slice[n]] - elif n >= 0: - return self.slice[n].value - else: - return self.stack[n].value - - def __setitem__(self, n, v): - self.slice[n].value = v - - def __getslice__(self, i, j): - return [s.value for s in self.slice[i:j]] - - def __len__(self): - return len(self.slice) - - def lineno(self, n): - return getattr(self.slice[n], 'lineno', 0) - - def set_lineno(self, n, lineno): - self.slice[n].lineno = lineno - - def linespan(self, n): - startline = getattr(self.slice[n], 'lineno', 0) - endline = getattr(self.slice[n], 'endlineno', startline) - return startline, endline - - def lexpos(self, n): - return getattr(self.slice[n], 'lexpos', 0) - - def set_lexpos(self, n, lexpos): - self.slice[n].lexpos = lexpos - - def lexspan(self, n): - startpos = getattr(self.slice[n], 'lexpos', 0) - endpos = getattr(self.slice[n], 'endlexpos', startpos) - return startpos, endpos - - def error(self): - raise SyntaxError - -# ----------------------------------------------------------------------------- -# == LRParser == -# -# The LR Parsing engine. -# ----------------------------------------------------------------------------- - -class LRParser: - def __init__(self, lrtab, errorf): - self.productions = lrtab.lr_productions - self.action = lrtab.lr_action - self.goto = lrtab.lr_goto - self.errorfunc = errorf - self.set_defaulted_states() - self.errorok = True - - def errok(self): - self.errorok = True - - def restart(self): - del self.statestack[:] - del self.symstack[:] - sym = YaccSymbol() - sym.type = '$end' - self.symstack.append(sym) - self.statestack.append(0) - - # Defaulted state support. - # This method identifies parser states where there is only one possible reduction action. - # For such states, the parser can make a choose to make a rule reduction without consuming - # the next look-ahead token. This delayed invocation of the tokenizer can be useful in - # certain kinds of advanced parsing situations where the lexer and parser interact with - # each other or change states (i.e., manipulation of scope, lexer states, etc.). - # - # See: http://www.gnu.org/software/bison/manual/html_node/Default-Reductions.html#Default-Reductions - def set_defaulted_states(self): - self.defaulted_states = {} - for state, actions in self.action.items(): - rules = list(actions.values()) - if len(rules) == 1 and rules[0] < 0: - self.defaulted_states[state] = rules[0] - - def disable_defaulted_states(self): - self.defaulted_states = {} - - def parse(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): - if debug or yaccdevel: - if isinstance(debug, int): - debug = PlyLogger(sys.stderr) - return self.parsedebug(input, lexer, debug, tracking, tokenfunc) - elif tracking: - return self.parseopt(input, lexer, debug, tracking, tokenfunc) - else: - return self.parseopt_notrack(input, lexer, debug, tracking, tokenfunc) - - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # parsedebug(). - # - # This is the debugging enabled version of parse(). All changes made to the - # parsing engine should be made here. Optimized versions of this function - # are automatically created by the ply/ygen.py script. This script cuts out - # sections enclosed in markers such as this: - # - # #--! DEBUG - # statements - # #--! 
DEBUG - # - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - def parsedebug(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): - #--! parsedebug-start - lookahead = None # Current lookahead symbol - lookaheadstack = [] # Stack of lookahead symbols - actions = self.action # Local reference to action table (to avoid lookup on self.) - goto = self.goto # Local reference to goto table (to avoid lookup on self.) - prod = self.productions # Local reference to production list (to avoid lookup on self.) - defaulted_states = self.defaulted_states # Local reference to defaulted states - pslice = YaccProduction(None) # Production object passed to grammar rules - errorcount = 0 # Used during error recovery - - #--! DEBUG - debug.info('PLY: PARSE DEBUG START') - #--! DEBUG - - # If no lexer was given, we will try to use the lex module - if not lexer: - from . import lex - lexer = lex.lexer - - # Set up the lexer and parser objects on pslice - pslice.lexer = lexer - pslice.parser = self - - # If input was supplied, pass to lexer - if input is not None: - lexer.input(input) - - if tokenfunc is None: - # Tokenize function - get_token = lexer.token - else: - get_token = tokenfunc - - # Set the parser() token method (sometimes used in error recovery) - self.token = get_token - - # Set up the state and symbol stacks - - statestack = [] # Stack of parsing states - self.statestack = statestack - symstack = [] # Stack of grammar symbols - self.symstack = symstack - - pslice.stack = symstack # Put in the production - errtoken = None # Err token - - # The start state is assumed to be (0,$end) - - statestack.append(0) - sym = YaccSymbol() - sym.type = '$end' - symstack.append(sym) - state = 0 - while True: - # Get the next symbol on the input. If a lookahead symbol - # is already set, we just use that. Otherwise, we'll pull - # the next token off of the lookaheadstack or from the lexer - - #--! DEBUG - debug.debug('') - debug.debug('State : %s', state) - #--! DEBUG - - if state not in defaulted_states: - if not lookahead: - if not lookaheadstack: - lookahead = get_token() # Get the next token - else: - lookahead = lookaheadstack.pop() - if not lookahead: - lookahead = YaccSymbol() - lookahead.type = '$end' - - # Check the action table - ltype = lookahead.type - t = actions[state].get(ltype) - else: - t = defaulted_states[state] - #--! DEBUG - debug.debug('Defaulted state %s: Reduce using %d', state, -t) - #--! DEBUG - - #--! DEBUG - debug.debug('Stack : %s', - ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) - #--! DEBUG - - if t is not None: - if t > 0: - # shift a symbol on the stack - statestack.append(t) - state = t - - #--! DEBUG - debug.debug('Action : Shift and goto state %s', t) - #--! DEBUG - - symstack.append(lookahead) - lookahead = None - - # Decrease error count on successful shift - if errorcount: - errorcount -= 1 - continue - - if t < 0: - # reduce a symbol on the stack, emit a production - p = prod[-t] - pname = p.name - plen = p.len - - # Get production function - sym = YaccSymbol() - sym.type = pname # Production name - sym.value = None - - #--! DEBUG - if plen: - debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, - '['+','.join([format_stack_entry(_v.value) for _v in symstack[-plen:]])+']', - goto[statestack[-1-plen]][pname]) - else: - debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, [], - goto[statestack[-1]][pname]) - - #--! 
DEBUG - - if plen: - targ = symstack[-plen-1:] - targ[0] = sym - - #--! TRACKING - if tracking: - t1 = targ[1] - sym.lineno = t1.lineno - sym.lexpos = t1.lexpos - t1 = targ[-1] - sym.endlineno = getattr(t1, 'endlineno', t1.lineno) - sym.endlexpos = getattr(t1, 'endlexpos', t1.lexpos) - #--! TRACKING - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # The code enclosed in this section is duplicated - # below as a performance optimization. Make sure - # changes get made in both locations. - - pslice.slice = targ - - try: - # Call the grammar rule with our special slice object - del symstack[-plen:] - self.state = state - p.callable(pslice) - del statestack[-plen:] - #--! DEBUG - debug.info('Result : %s', format_result(pslice[0])) - #--! DEBUG - symstack.append(sym) - state = goto[statestack[-1]][pname] - statestack.append(state) - except SyntaxError: - # If an error was set. Enter error recovery state - lookaheadstack.append(lookahead) # Save the current lookahead token - symstack.extend(targ[1:-1]) # Put the production slice back on the stack - statestack.pop() # Pop back one state (before the reduce) - state = statestack[-1] - sym.type = 'error' - sym.value = 'error' - lookahead = sym - errorcount = error_count - self.errorok = False - - continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - else: - - #--! TRACKING - if tracking: - sym.lineno = lexer.lineno - sym.lexpos = lexer.lexpos - #--! TRACKING - - targ = [sym] - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # The code enclosed in this section is duplicated - # above as a performance optimization. Make sure - # changes get made in both locations. - - pslice.slice = targ - - try: - # Call the grammar rule with our special slice object - self.state = state - p.callable(pslice) - #--! DEBUG - debug.info('Result : %s', format_result(pslice[0])) - #--! DEBUG - symstack.append(sym) - state = goto[statestack[-1]][pname] - statestack.append(state) - except SyntaxError: - # If an error was set. Enter error recovery state - lookaheadstack.append(lookahead) # Save the current lookahead token - statestack.pop() # Pop back one state (before the reduce) - state = statestack[-1] - sym.type = 'error' - sym.value = 'error' - lookahead = sym - errorcount = error_count - self.errorok = False - - continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - if t == 0: - n = symstack[-1] - result = getattr(n, 'value', None) - #--! DEBUG - debug.info('Done : Returning %s', format_result(result)) - debug.info('PLY: PARSE DEBUG END') - #--! DEBUG - return result - - if t is None: - - #--! DEBUG - debug.error('Error : %s', - ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) - #--! DEBUG - - # We have some kind of parsing error here. To handle - # this, we are going to push the current token onto - # the tokenstack and replace it with an 'error' token. - # If there are any synchronization rules, they may - # catch it. - # - # In addition to pushing the error token, we call call - # the user defined p_error() function if this is the - # first syntax error. This function is only called if - # errorcount == 0. - if errorcount == 0 or self.errorok: - errorcount = error_count - self.errorok = False - errtoken = lookahead - if errtoken.type == '$end': - errtoken = None # End of file! 
- if self.errorfunc: - if errtoken and not hasattr(errtoken, 'lexer'): - errtoken.lexer = lexer - self.state = state - tok = call_errorfunc(self.errorfunc, errtoken, self) - if self.errorok: - # User must have done some kind of panic - # mode recovery on their own. The - # returned token is the next lookahead - lookahead = tok - errtoken = None - continue - else: - if errtoken: - if hasattr(errtoken, 'lineno'): - lineno = lookahead.lineno - else: - lineno = 0 - if lineno: - sys.stderr.write('yacc: Syntax error at line %d, token=%s\n' % (lineno, errtoken.type)) - else: - sys.stderr.write('yacc: Syntax error, token=%s' % errtoken.type) - else: - sys.stderr.write('yacc: Parse error in input. EOF\n') - return - - else: - errorcount = error_count - - # case 1: the statestack only has 1 entry on it. If we're in this state, the - # entire parse has been rolled back and we're completely hosed. The token is - # discarded and we just keep going. - - if len(statestack) <= 1 and lookahead.type != '$end': - lookahead = None - errtoken = None - state = 0 - # Nuke the pushback stack - del lookaheadstack[:] - continue - - # case 2: the statestack has a couple of entries on it, but we're - # at the end of the file. nuke the top entry and generate an error token - - # Start nuking entries on the stack - if lookahead.type == '$end': - # Whoa. We're really hosed here. Bail out - return - - if lookahead.type != 'error': - sym = symstack[-1] - if sym.type == 'error': - # Hmmm. Error is on top of stack, we'll just nuke input - # symbol and continue - #--! TRACKING - if tracking: - sym.endlineno = getattr(lookahead, 'lineno', sym.lineno) - sym.endlexpos = getattr(lookahead, 'lexpos', sym.lexpos) - #--! TRACKING - lookahead = None - continue - - # Create the error symbol for the first time and make it the new lookahead symbol - t = YaccSymbol() - t.type = 'error' - - if hasattr(lookahead, 'lineno'): - t.lineno = t.endlineno = lookahead.lineno - if hasattr(lookahead, 'lexpos'): - t.lexpos = t.endlexpos = lookahead.lexpos - t.value = lookahead - lookaheadstack.append(lookahead) - lookahead = t - else: - sym = symstack.pop() - #--! TRACKING - if tracking: - lookahead.lineno = sym.lineno - lookahead.lexpos = sym.lexpos - #--! TRACKING - statestack.pop() - state = statestack[-1] - - continue - - # Call an error function here - raise RuntimeError('yacc: internal parser error!!!\n') - - #--! parsedebug-end - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # parseopt(). - # - # Optimized version of parse() method. DO NOT EDIT THIS CODE DIRECTLY! - # This code is automatically generated by the ply/ygen.py script. Make - # changes to the parsedebug() method instead. - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - def parseopt(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): - #--! parseopt-start - lookahead = None # Current lookahead symbol - lookaheadstack = [] # Stack of lookahead symbols - actions = self.action # Local reference to action table (to avoid lookup on self.) - goto = self.goto # Local reference to goto table (to avoid lookup on self.) - prod = self.productions # Local reference to production list (to avoid lookup on self.) 
- defaulted_states = self.defaulted_states # Local reference to defaulted states - pslice = YaccProduction(None) # Production object passed to grammar rules - errorcount = 0 # Used during error recovery - - - # If no lexer was given, we will try to use the lex module - if not lexer: - from . import lex - lexer = lex.lexer - - # Set up the lexer and parser objects on pslice - pslice.lexer = lexer - pslice.parser = self - - # If input was supplied, pass to lexer - if input is not None: - lexer.input(input) - - if tokenfunc is None: - # Tokenize function - get_token = lexer.token - else: - get_token = tokenfunc - - # Set the parser() token method (sometimes used in error recovery) - self.token = get_token - - # Set up the state and symbol stacks - - statestack = [] # Stack of parsing states - self.statestack = statestack - symstack = [] # Stack of grammar symbols - self.symstack = symstack - - pslice.stack = symstack # Put in the production - errtoken = None # Err token - - # The start state is assumed to be (0,$end) - - statestack.append(0) - sym = YaccSymbol() - sym.type = '$end' - symstack.append(sym) - state = 0 - while True: - # Get the next symbol on the input. If a lookahead symbol - # is already set, we just use that. Otherwise, we'll pull - # the next token off of the lookaheadstack or from the lexer - - - if state not in defaulted_states: - if not lookahead: - if not lookaheadstack: - lookahead = get_token() # Get the next token - else: - lookahead = lookaheadstack.pop() - if not lookahead: - lookahead = YaccSymbol() - lookahead.type = '$end' - - # Check the action table - ltype = lookahead.type - t = actions[state].get(ltype) - else: - t = defaulted_states[state] - - - if t is not None: - if t > 0: - # shift a symbol on the stack - statestack.append(t) - state = t - - - symstack.append(lookahead) - lookahead = None - - # Decrease error count on successful shift - if errorcount: - errorcount -= 1 - continue - - if t < 0: - # reduce a symbol on the stack, emit a production - p = prod[-t] - pname = p.name - plen = p.len - - # Get production function - sym = YaccSymbol() - sym.type = pname # Production name - sym.value = None - - - if plen: - targ = symstack[-plen-1:] - targ[0] = sym - - #--! TRACKING - if tracking: - t1 = targ[1] - sym.lineno = t1.lineno - sym.lexpos = t1.lexpos - t1 = targ[-1] - sym.endlineno = getattr(t1, 'endlineno', t1.lineno) - sym.endlexpos = getattr(t1, 'endlexpos', t1.lexpos) - #--! TRACKING - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # The code enclosed in this section is duplicated - # below as a performance optimization. Make sure - # changes get made in both locations. - - pslice.slice = targ - - try: - # Call the grammar rule with our special slice object - del symstack[-plen:] - self.state = state - p.callable(pslice) - del statestack[-plen:] - symstack.append(sym) - state = goto[statestack[-1]][pname] - statestack.append(state) - except SyntaxError: - # If an error was set. Enter error recovery state - lookaheadstack.append(lookahead) # Save the current lookahead token - symstack.extend(targ[1:-1]) # Put the production slice back on the stack - statestack.pop() # Pop back one state (before the reduce) - state = statestack[-1] - sym.type = 'error' - sym.value = 'error' - lookahead = sym - errorcount = error_count - self.errorok = False - - continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - else: - - #--! TRACKING - if tracking: - sym.lineno = lexer.lineno - sym.lexpos = lexer.lexpos - #--! 
TRACKING - - targ = [sym] - - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - # The code enclosed in this section is duplicated - # above as a performance optimization. Make sure - # changes get made in both locations. - - pslice.slice = targ - - try: - # Call the grammar rule with our special slice object - self.state = state - p.callable(pslice) - symstack.append(sym) - state = goto[statestack[-1]][pname] - statestack.append(state) - except SyntaxError: - # If an error was set. Enter error recovery state - lookaheadstack.append(lookahead) # Save the current lookahead token - statestack.pop() # Pop back one state (before the reduce) - state = statestack[-1] - sym.type = 'error' - sym.value = 'error' - lookahead = sym - errorcount = error_count - self.errorok = False - - continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - if t == 0: - n = symstack[-1] - result = getattr(n, 'value', None) - return result - - if t is None: - - - # We have some kind of parsing error here. To handle - # this, we are going to push the current token onto - # the tokenstack and replace it with an 'error' token. - # If there are any synchronization rules, they may - # catch it. - # - # In addition to pushing the error token, we call call - # the user defined p_error() function if this is the - # first syntax error. This function is only called if - # errorcount == 0. - if errorcount == 0 or self.errorok: - errorcount = error_count - self.errorok = False - errtoken = lookahead - if errtoken.type == '$end': - errtoken = None # End of file! - if self.errorfunc: - if errtoken and not hasattr(errtoken, 'lexer'): - errtoken.lexer = lexer - self.state = state - tok = call_errorfunc(self.errorfunc, errtoken, self) - if self.errorok: - # User must have done some kind of panic - # mode recovery on their own. The - # returned token is the next lookahead - lookahead = tok - errtoken = None - continue - else: - if errtoken: - if hasattr(errtoken, 'lineno'): - lineno = lookahead.lineno - else: - lineno = 0 - if lineno: - sys.stderr.write('yacc: Syntax error at line %d, token=%s\n' % (lineno, errtoken.type)) - else: - sys.stderr.write('yacc: Syntax error, token=%s' % errtoken.type) - else: - sys.stderr.write('yacc: Parse error in input. EOF\n') - return +# .value attribute of the underlying YaccSymbol object. +# The lineno() method returns the line number of a given +# item (or 0 if not defined). The linespan() method returns +# a tuple of (startline,endline) representing the range of lines +# for a symbol. The lexspan() method returns a tuple (lexpos,endlexpos) +# representing the range of positional information for a symbol. - else: - errorcount = error_count +class YaccProduction: + def __init__(self, s, stack=None): + self.slice = s + self.stack = stack + self.lexer = None + self.parser = None - # case 1: the statestack only has 1 entry on it. If we're in this state, the - # entire parse has been rolled back and we're completely hosed. The token is - # discarded and we just keep going. + def __getitem__(self, n): + if isinstance(n, slice): + return [s.value for s in self.slice[n]] + elif n >= 0: + return self.slice[n].value + else: + return self.stack[n].value - if len(statestack) <= 1 and lookahead.type != '$end': - lookahead = None - errtoken = None - state = 0 - # Nuke the pushback stack - del lookaheadstack[:] - continue + def __setitem__(self, n, v): + self.slice[n].value = v - # case 2: the statestack has a couple of entries on it, but we're - # at the end of the file. 
nuke the top entry and generate an error token + def __getslice__(self, i, j): + return [s.value for s in self.slice[i:j]] - # Start nuking entries on the stack - if lookahead.type == '$end': - # Whoa. We're really hosed here. Bail out - return + def __len__(self): + return len(self.slice) - if lookahead.type != 'error': - sym = symstack[-1] - if sym.type == 'error': - # Hmmm. Error is on top of stack, we'll just nuke input - # symbol and continue - #--! TRACKING - if tracking: - sym.endlineno = getattr(lookahead, 'lineno', sym.lineno) - sym.endlexpos = getattr(lookahead, 'lexpos', sym.lexpos) - #--! TRACKING - lookahead = None - continue + def lineno(self, n): + return getattr(self.slice[n], 'lineno', 0) - # Create the error symbol for the first time and make it the new lookahead symbol - t = YaccSymbol() - t.type = 'error' + def set_lineno(self, n, lineno): + self.slice[n].lineno = lineno - if hasattr(lookahead, 'lineno'): - t.lineno = t.endlineno = lookahead.lineno - if hasattr(lookahead, 'lexpos'): - t.lexpos = t.endlexpos = lookahead.lexpos - t.value = lookahead - lookaheadstack.append(lookahead) - lookahead = t - else: - sym = symstack.pop() - #--! TRACKING - if tracking: - lookahead.lineno = sym.lineno - lookahead.lexpos = sym.lexpos - #--! TRACKING - statestack.pop() - state = statestack[-1] + def linespan(self, n): + startline = getattr(self.slice[n], 'lineno', 0) + endline = getattr(self.slice[n], 'endlineno', startline) + return startline, endline - continue + def lexpos(self, n): + return getattr(self.slice[n], 'lexpos', 0) - # Call an error function here - raise RuntimeError('yacc: internal parser error!!!\n') + def set_lexpos(self, n, lexpos): + self.slice[n].lexpos = lexpos + + def lexspan(self, n): + startpos = getattr(self.slice[n], 'lexpos', 0) + endpos = getattr(self.slice[n], 'endlexpos', startpos) + return startpos, endpos + + def error(self): + raise SyntaxError + +# ----------------------------------------------------------------------------- +# == LRParser == +# +# The LR Parsing engine. +# ----------------------------------------------------------------------------- + +class LRParser: + def __init__(self, lrtab, errorf): + self.productions = lrtab.lr_productions + self.action = lrtab.lr_action + self.goto = lrtab.lr_goto + self.errorfunc = errorf + self.set_defaulted_states() + self.errorok = True + + def errok(self): + self.errorok = True + + def restart(self): + del self.statestack[:] + del self.symstack[:] + sym = YaccSymbol() + sym.type = '$end' + self.symstack.append(sym) + self.statestack.append(0) + + # Defaulted state support. + # This method identifies parser states where there is only one possible reduction action. + # For such states, the parser can make a choose to make a rule reduction without consuming + # the next look-ahead token. This delayed invocation of the tokenizer can be useful in + # certain kinds of advanced parsing situations where the lexer and parser interact with + # each other or change states (i.e., manipulation of scope, lexer states, etc.). + # + # See: http://www.gnu.org/software/bison/manual/html_node/Default-Reductions.html#Default-Reductions + def set_defaulted_states(self): + self.defaulted_states = {} + for state, actions in self.action.items(): + rules = list(actions.values()) + if len(rules) == 1 and rules[0] < 0: + self.defaulted_states[state] = rules[0] - #--! parseopt-end + def disable_defaulted_states(self): + self.defaulted_states = {} - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
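# The YaccProduction class added above is what each grammar rule function
# receives as its argument. A minimal usage sketch, assuming the usual PLY
# rule conventions; the rule name and token names below are illustrative and
# not part of this patch.

def p_expression_plus(p):
    'expression : expression PLUS expression'
    # p[1] and p[3] go through YaccProduction.__getitem__, which returns the
    # .value of the matched symbols; assigning p[0] goes through __setitem__.
    p[0] = p[1] + p[3]
    # Position helpers defined above: lineno()/lexpos() fall back to 0 when
    # the lexer recorded nothing; linespan()/lexspan() return (start, end)
    # pairs, whose end values are only meaningful when tracking is enabled.
    start_line, end_line = p.linespan(1)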
- # parseopt_notrack(). + # parse(). # - # Optimized version of parseopt() with line number tracking removed. - # DO NOT EDIT THIS CODE DIRECTLY. This code is automatically generated - # by the ply/ygen.py script. Make changes to the parsedebug() method instead. - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # This is the core parsing engine. To operate, it requires a lexer object. + # Two options are provided. The debug flag turns on debugging so that you can + # see the various rule reductions and parsing steps. tracking turns on position + # tracking. In this mode, symbols will record the starting/ending line number and + # character index. + + def parse(self, input=None, lexer=None, debug=False, tracking=False): + # If debugging has been specified as a flag, turn it into a logging object + if isinstance(debug, int) and debug: + debug = PlyLogger(sys.stderr) - def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, tokenfunc=None): - #--! parseopt-notrack-start lookahead = None # Current lookahead symbol lookaheadstack = [] # Stack of lookahead symbols actions = self.action # Local reference to action table (to avoid lookup on self.) @@ -1007,6 +284,8 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, pslice = YaccProduction(None) # Production object passed to grammar rules errorcount = 0 # Used during error recovery + if debug: + debug.info('PLY: PARSE DEBUG START') # If no lexer was given, we will try to use the lex module if not lexer: @@ -1021,24 +300,14 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, if input is not None: lexer.input(input) - if tokenfunc is None: - # Tokenize function - get_token = lexer.token - else: - get_token = tokenfunc - - # Set the parser() token method (sometimes used in error recovery) - self.token = get_token + # Set the token function + get_token = self.token = lexer.token # Set up the state and symbol stacks - - statestack = [] # Stack of parsing states - self.statestack = statestack - symstack = [] # Stack of grammar symbols - self.symstack = symstack - - pslice.stack = symstack # Put in the production - errtoken = None # Err token + statestack = self.statestack = [] # Stack of parsing states + symstack = self.symstack = [] # Stack of grammar symbols + pslice.stack = symstack # Put in the production + errtoken = None # Err token # The start state is assumed to be (0,$end) @@ -1052,6 +321,8 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, # is already set, we just use that. Otherwise, we'll pull # the next token off of the lookaheadstack or from the lexer + if debug: + debug.debug('State : %s', state) if state not in defaulted_states: if not lookahead: @@ -1068,7 +339,12 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, t = actions[state].get(ltype) else: t = defaulted_states[state] + if debug: + debug.debug('Defaulted state %s: Reduce using %d', state, -t) + if debug: + debug.debug('Stack : %s', + ('%s . 
%s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) if t is not None: if t > 0: @@ -1076,6 +352,8 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, statestack.append(t) state = t + if debug: + debug.debug('Action : Shift and goto state %s', t) symstack.append(lookahead) lookahead = None @@ -1096,11 +374,26 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, sym.type = pname # Production name sym.value = None + if debug: + if plen: + debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, + '['+','.join([format_stack_entry(_v.value) for _v in symstack[-plen:]])+']', + goto[statestack[-1-plen]][pname]) + else: + debug.info('Action : Reduce rule [%s] with %s and goto state %d', p.str, [], + goto[statestack[-1]][pname]) if plen: targ = symstack[-plen-1:] targ[0] = sym + if tracking: + t1 = targ[1] + sym.lineno = t1.lineno + sym.lexpos = t1.lexpos + t1 = targ[-1] + sym.endlineno = getattr(t1, 'endlineno', t1.lineno) + sym.endlexpos = getattr(t1, 'endlexpos', t1.lexpos) # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # The code enclosed in this section is duplicated @@ -1115,6 +408,8 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, self.state = state p.callable(pslice) del statestack[-plen:] + if debug: + debug.info('Result : %s', format_result(pslice[0])) symstack.append(sym) state = goto[statestack[-1]][pname] statestack.append(state) @@ -1131,10 +426,12 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, self.errorok = False continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! else: + if tracking: + sym.lineno = lexer.lineno + sym.lexpos = lexer.lexpos targ = [sym] @@ -1149,6 +446,8 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, # Call the grammar rule with our special slice object self.state = state p.callable(pslice) + if debug: + debug.info('Result : %s', format_result(pslice[0])) symstack.append(sym) state = goto[statestack[-1]][pname] statestack.append(state) @@ -1164,15 +463,22 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, self.errorok = False continue - # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! if t == 0: n = symstack[-1] result = getattr(n, 'value', None) + + if debug: + debug.info('Done : Returning %s', format_result(result)) + debug.info('PLY: PARSE DEBUG END') + return result if t is None: + if debug: + debug.error('Error : %s', + ('%s . %s' % (' '.join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) # We have some kind of parsing error here. To handle # this, we are going to push the current token onto @@ -1194,7 +500,7 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, if errtoken and not hasattr(errtoken, 'lexer'): errtoken.lexer = lexer self.state = state - tok = call_errorfunc(self.errorfunc, errtoken, self) + tok = self.errorfunc(errtoken) if self.errorok: # User must have done some kind of panic # mode recovery on their own. The @@ -1244,6 +550,9 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, if sym.type == 'error': # Hmmm. 
Error is on top of stack, we'll just nuke input # symbol and continue + if tracking: + sym.endlineno = getattr(lookahead, 'lineno', sym.lineno) + sym.endlexpos = getattr(lookahead, 'lexpos', sym.lexpos) lookahead = None continue @@ -1260,16 +569,17 @@ def parseopt_notrack(self, input=None, lexer=None, debug=False, tracking=False, lookahead = t else: sym = symstack.pop() + if tracking: + lookahead.lineno = sym.lineno + lookahead.lexpos = sym.lexpos statestack.pop() state = statestack[-1] continue - # Call an error function here + # If we'r here, something really bad happened raise RuntimeError('yacc: internal parser error!!!\n') - #--! parseopt-notrack-end - # ----------------------------------------------------------------------------- # === Grammar Representation === # @@ -1372,32 +682,6 @@ def bind(self, pdict): if self.func: self.callable = pdict[self.func] -# This class serves as a minimal standin for Production objects when -# reading table data from files. It only contains information -# actually used by the LR parsing engine, plus some additional -# debugging information. -class MiniProduction(object): - def __init__(self, str, name, len, func, file, line): - self.name = name - self.len = len - self.func = func - self.callable = None - self.file = file - self.line = line - self.str = str - - def __str__(self): - return self.str - - def __repr__(self): - return 'MiniProduction(%s)' % self.str - - # Bind the production function name to a callable - def bind(self, pdict): - if self.func: - self.callable = pdict[self.func] - - # ----------------------------------------------------------------------------- # class LRItem # @@ -1955,77 +1239,6 @@ def build_lritems(self): i += 1 p.lr_items = lr_items -# ----------------------------------------------------------------------------- -# == Class LRTable == -# -# This basic class represents a basic table of LR parsing information. -# Methods for generating the tables are not defined here. They are defined -# in the derived class LRGeneratedTable. 
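# The error path above now calls the user-supplied p_error() directly with the
# offending token (or None at end of input), and a token returned after
# parser.errok() becomes the new lookahead. A sketch of panic-mode recovery
# under those semantics; 'parser' is assumed to be the object returned by
# yacc(), and the SEMI synchronization token is purely illustrative.

def p_error(tok):
    if tok is None:
        print('Syntax error at end of input')
        return
    print('Syntax error at %r' % tok.value)
    # Discard input up to a likely synchronization point.
    while True:
        nxt = parser.token()        # parser.token is bound to lexer.token in parse()
        if nxt is None or nxt.type == 'SEMI':
            break
    parser.errok()                  # tell the parser that recovery succeeded
    return nxt                      # becomes the next lookahead (None at EOF)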
-# ----------------------------------------------------------------------------- - -class VersionError(YaccError): - pass - -class LRTable(object): - def __init__(self): - self.lr_action = None - self.lr_goto = None - self.lr_productions = None - self.lr_method = None - - def read_table(self, module): - if isinstance(module, types.ModuleType): - parsetab = module - else: - exec('import %s' % module) - parsetab = sys.modules[module] - - if parsetab._tabversion != __tabversion__: - raise VersionError('yacc table file version is out of date') - - self.lr_action = parsetab._lr_action - self.lr_goto = parsetab._lr_goto - - self.lr_productions = [] - for p in parsetab._lr_productions: - self.lr_productions.append(MiniProduction(*p)) - - self.lr_method = parsetab._lr_method - return parsetab._lr_signature - - def read_pickle(self, filename): - try: - import cPickle as pickle - except ImportError: - import pickle - - if not os.path.exists(filename): - raise ImportError - - in_f = open(filename, 'rb') - - tabversion = pickle.load(in_f) - if tabversion != __tabversion__: - raise VersionError('yacc table file version is out of date') - self.lr_method = pickle.load(in_f) - signature = pickle.load(in_f) - self.lr_action = pickle.load(in_f) - self.lr_goto = pickle.load(in_f) - productions = pickle.load(in_f) - - self.lr_productions = [] - for p in productions: - self.lr_productions.append(MiniProduction(*p)) - - in_f.close() - return signature - - # Bind all production function names to callable objects in pdict - def bind_callables(self, pdict): - for p in self.lr_productions: - p.bind(pdict) - - # ----------------------------------------------------------------------------- # === LR Generator === # @@ -2087,20 +1300,17 @@ def traverse(x, N, stack, F, X, R, FP): class LALRError(YaccError): pass + # ----------------------------------------------------------------------------- -# == LRGeneratedTable == +# == LRTable == # # This class implements the LR table generation algorithm. There are no -# public methods except for write() +# public methods. # ----------------------------------------------------------------------------- -class LRGeneratedTable(LRTable): - def __init__(self, grammar, method='LALR', log=None): - if method not in ['SLR', 'LALR']: - raise LALRError('Unsupported method %s' % method) - +class LRTable: + def __init__(self, grammar, log=None): self.grammar = grammar - self.lr_method = method # Set up the logger if not log: @@ -2130,6 +1340,11 @@ def __init__(self, grammar, method='LALR', log=None): self.grammar.compute_follow() self.lr_parse_table() + # Bind all production function names to callable objects in pdict + def bind_callables(self, pdict): + for p in self.lr_productions: + p.bind(pdict) + # Compute the LR(0) closure operation on I, where I is a set of LR(0) items. def lr0_closure(self, I): @@ -2536,15 +1751,11 @@ def lr_parse_table(self): actionp = {} # Action production array (temporary) - log.info('Parsing method: %s', self.lr_method) - # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items # This determines the number of states C = self.lr0_items() - - if self.lr_method == 'LALR': - self.add_lalr_lookaheads(C) + self.add_lalr_lookaheads(C) # Build the parser table, state by state st = 0 @@ -2569,10 +1780,7 @@ def lr_parse_table(self): st_actionp['$end'] = p else: # We are at the end of a production. Reduce! 
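# With the read_table()/read_pickle() loaders removed above, PLY no longer
# caches parsing tables. A caller who still wants a cached copy could
# serialize the same pieces the deleted code serialized. This is a rough
# sketch, not a PLY API: dump_tables() is made up for illustration, under the
# assumption that 'lr' is a constructed table object exposing lr_action,
# lr_goto and lr_productions as used elsewhere in this patch.

import pickle

def dump_tables(lr, filename):
    prods = []
    for p in lr.lr_productions:
        if p.func:
            prods.append((p.str, p.name, p.len, p.func, p.file, p.line))
        else:
            prods.append((str(p), p.name, p.len, None, None, None))
    with open(filename, 'wb') as f:
        # Store the action table, goto table, and reduced productions, much as
        # the removed pickle-based code did.
        pickle.dump((lr.lr_action, lr.lr_goto, prods), f)

# Loading such a file back would mean rebuilding lightweight production
# records and re-binding their callables, as the removed MiniProduction and
# read_pickle() code did.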
- if self.lr_method == 'LALR': - laheads = p.lookaheads[st] - else: - laheads = self.grammar.Follow[p.name] + laheads = p.lookaheads[st] for a in laheads: actlist.append((a, p, 'reduce using rule %d (%s)' % (p.number, p))) r = st_action.get(a) @@ -2714,155 +1922,6 @@ def lr_parse_table(self): goto[st] = st_goto st += 1 - # ----------------------------------------------------------------------------- - # write() - # - # This function writes the LR parsing tables to a file - # ----------------------------------------------------------------------------- - - def write_table(self, tabmodule, outputdir='', signature=''): - if isinstance(tabmodule, types.ModuleType): - raise IOError("Won't overwrite existing tabmodule") - - basemodulename = tabmodule.split('.')[-1] - filename = os.path.join(outputdir, basemodulename) + '.py' - try: - f = open(filename, 'w') - - f.write(''' -# %s -# This file is automatically generated. Do not edit. -# pylint: disable=W,C,R -_tabversion = %r - -_lr_method = %r - -_lr_signature = %r - ''' % (os.path.basename(filename), __tabversion__, self.lr_method, signature)) - - # Change smaller to 0 to go back to original tables - smaller = 1 - - # Factor out names to try and make smaller - if smaller: - items = {} - - for s, nd in self.lr_action.items(): - for name, v in nd.items(): - i = items.get(name) - if not i: - i = ([], []) - items[name] = i - i[0].append(s) - i[1].append(v) - - f.write('\n_lr_action_items = {') - for k, v in items.items(): - f.write('%r:([' % k) - for i in v[0]: - f.write('%r,' % i) - f.write('],[') - for i in v[1]: - f.write('%r,' % i) - - f.write(']),') - f.write('}\n') - - f.write(''' -_lr_action = {} -for _k, _v in _lr_action_items.items(): - for _x,_y in zip(_v[0],_v[1]): - if not _x in _lr_action: _lr_action[_x] = {} - _lr_action[_x][_k] = _y -del _lr_action_items -''') - - else: - f.write('\n_lr_action = { ') - for k, v in self.lr_action.items(): - f.write('(%r,%r):%r,' % (k[0], k[1], v)) - f.write('}\n') - - if smaller: - # Factor out names to try and make smaller - items = {} - - for s, nd in self.lr_goto.items(): - for name, v in nd.items(): - i = items.get(name) - if not i: - i = ([], []) - items[name] = i - i[0].append(s) - i[1].append(v) - - f.write('\n_lr_goto_items = {') - for k, v in items.items(): - f.write('%r:([' % k) - for i in v[0]: - f.write('%r,' % i) - f.write('],[') - for i in v[1]: - f.write('%r,' % i) - - f.write(']),') - f.write('}\n') - - f.write(''' -_lr_goto = {} -for _k, _v in _lr_goto_items.items(): - for _x, _y in zip(_v[0], _v[1]): - if not _x in _lr_goto: _lr_goto[_x] = {} - _lr_goto[_x][_k] = _y -del _lr_goto_items -''') - else: - f.write('\n_lr_goto = { ') - for k, v in self.lr_goto.items(): - f.write('(%r,%r):%r,' % (k[0], k[1], v)) - f.write('}\n') - - # Write production table - f.write('_lr_productions = [\n') - for p in self.lr_productions: - if p.func: - f.write(' (%r,%r,%d,%r,%r,%d),\n' % (p.str, p.name, p.len, - p.func, os.path.basename(p.file), p.line)) - else: - f.write(' (%r,%r,%d,None,None,None),\n' % (str(p), p.name, p.len)) - f.write(']\n') - f.close() - - except IOError as e: - raise - - - # ----------------------------------------------------------------------------- - # pickle_table() - # - # This function pickles the LR parsing tables to a supplied file object - # ----------------------------------------------------------------------------- - - def pickle_table(self, filename, signature=''): - try: - import cPickle as pickle - except ImportError: - import pickle - with open(filename, 'wb') as 
outf: - pickle.dump(__tabversion__, outf, pickle_protocol) - pickle.dump(self.lr_method, outf, pickle_protocol) - pickle.dump(signature, outf, pickle_protocol) - pickle.dump(self.lr_action, outf, pickle_protocol) - pickle.dump(self.lr_goto, outf, pickle_protocol) - - outp = [] - for p in self.lr_productions: - if p.func: - outp.append((p.str, p.name, p.len, p.func, os.path.basename(p.file), p.line)) - else: - outp.append((str(p), p.name, p.len, None, None, None)) - pickle.dump(outp, outf, pickle_protocol) - # ----------------------------------------------------------------------------- # === INTROSPECTION === # @@ -3209,20 +2268,13 @@ def validate_pfunctions(self): # Build a parser # ----------------------------------------------------------------------------- -def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, start=None, - check_recursion=True, optimize=False, write_tables=True, debugfile=debug_file, - outputdir=None, debuglog=None, errorlog=None, picklefile=None): - - if tabmodule is None: - tabmodule = tab_module +def yacc(*, debug=yaccdebug, module=None, start=None, + check_recursion=True, optimize=False, debugfile=debug_file, + debuglog=None, errorlog=None): # Reference to the parsing method of the last built parser global parse - # If pickling is enabled, table files are not created - if picklefile: - write_tables = 0 - if errorlog is None: errorlog = PlyLogger(sys.stderr) @@ -3240,32 +2292,6 @@ def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, star else: pdict = get_caller_module_dict(2) - if outputdir is None: - # If no output directory is set, the location of the output files - # is determined according to the following rules: - # - If tabmodule specifies a package, files go into that package directory - # - Otherwise, files go in the same directory as the specifying module - if isinstance(tabmodule, types.ModuleType): - srcfile = tabmodule.__file__ - else: - if '.' not in tabmodule: - srcfile = pdict['__file__'] - else: - parts = tabmodule.split('.') - pkgname = '.'.join(parts[:-1]) - exec('import %s' % pkgname) - srcfile = getattr(sys.modules[pkgname], '__file__', '') - outputdir = os.path.dirname(srcfile) - - # Determine if the module is package of a package or not. - # If so, fix the tabmodule setting so that tables load correctly - pkg = pdict.get('__package__') - if pkg and isinstance(tabmodule, str): - if '.' not in tabmodule: - tabmodule = pkg + '.' + tabmodule - - - # Set start symbol if it's specified directly using an argument if start is not None: pdict['start'] = start @@ -3277,40 +2303,17 @@ def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, star if pinfo.error: raise YaccError('Unable to build parser') - # Check signature against table files (if any) - signature = pinfo.signature() - - # Read the tables - try: - lr = LRTable() - if picklefile: - read_signature = lr.read_pickle(picklefile) - else: - read_signature = lr.read_table(tabmodule) - if optimize or (read_signature == signature): - try: - lr.bind_callables(pinfo.pdict) - parser = LRParser(lr, pinfo.error_func) - parse = parser.parse - return parser - except Exception as e: - errorlog.warning('There was a problem loading the table file: %r', e) - except VersionError as e: - errorlog.warning(str(e)) - except ImportError: - pass - if debuglog is None: if debug: try: - debuglog = PlyLogger(open(os.path.join(outputdir, debugfile), 'w')) + debuglog = PlyLogger(open(debugfile, 'w')) except IOError as e: errorlog.warning("Couldn't open %r. 
%s" % (debugfile, e)) debuglog = NullLogger() else: debuglog = NullLogger() - debuglog.info('Created by PLY version %s (http://www.dabeaz.com/ply)', __version__) + debuglog.info('Created by PLY (http://www.dabeaz.com/ply)') errors = False @@ -3427,11 +2430,8 @@ def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, star if errors: raise YaccError('Unable to build parser') - # Run the LRGeneratedTable on the grammar - if debug: - errorlog.debug('Generating %s tables', method) - - lr = LRGeneratedTable(grammar, method, debuglog) + # Run the LRTable on the grammar + lr = LRTable(grammar, debuglog) if debug: num_sr = len(lr.sr_conflicts) @@ -3474,22 +2474,6 @@ def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, star errorlog.warning('Rule (%s) is never reduced', rejected) warned_never.append(rejected) - # Write the table file if requested - if write_tables: - try: - lr.write_table(tabmodule, outputdir, signature) - if tabmodule in sys.modules: - del sys.modules[tabmodule] - except IOError as e: - errorlog.warning("Couldn't create %r. %s" % (tabmodule, e)) - - # Write a pickled version of the tables - if picklefile: - try: - lr.pickle_table(picklefile, signature) - except IOError as e: - errorlog.warning("Couldn't create %r. %s" % (picklefile, e)) - # Build the parser lr.bind_callables(pinfo.pdict) parser = LRParser(lr, pinfo.error_func) diff --git a/ply/ygen.py b/ply/ygen.py deleted file mode 100644 index 03b9318..0000000 --- a/ply/ygen.py +++ /dev/null @@ -1,69 +0,0 @@ -# ply: ygen.py -# -# This is a support program that auto-generates different versions of the YACC parsing -# function with different features removed for the purposes of performance. -# -# Users should edit the method LRParser.parsedebug() in yacc.py. The source code -# for that method is then used to create the other methods. See the comments in -# yacc.py for further details. - -import os.path -import shutil - -def get_source_range(lines, tag): - srclines = enumerate(lines) - start_tag = '#--! %s-start' % tag - end_tag = '#--! %s-end' % tag - - for start_index, line in srclines: - if line.strip().startswith(start_tag): - break - - for end_index, line in srclines: - if line.strip().endswith(end_tag): - break - - return (start_index + 1, end_index) - -def filter_section(lines, tag): - filtered_lines = [] - include = True - tag_text = '#--! 
%s' % tag - for line in lines: - if line.strip().startswith(tag_text): - include = not include - elif include: - filtered_lines.append(line) - return filtered_lines - -def main(): - dirname = os.path.dirname(__file__) - shutil.copy2(os.path.join(dirname, 'yacc.py'), os.path.join(dirname, 'yacc.py.bak')) - with open(os.path.join(dirname, 'yacc.py'), 'r') as f: - lines = f.readlines() - - parse_start, parse_end = get_source_range(lines, 'parsedebug') - parseopt_start, parseopt_end = get_source_range(lines, 'parseopt') - parseopt_notrack_start, parseopt_notrack_end = get_source_range(lines, 'parseopt-notrack') - - # Get the original source - orig_lines = lines[parse_start:parse_end] - - # Filter the DEBUG sections out - parseopt_lines = filter_section(orig_lines, 'DEBUG') - - # Filter the TRACKING sections out - parseopt_notrack_lines = filter_section(parseopt_lines, 'TRACKING') - - # Replace the parser source sections with updated versions - lines[parseopt_notrack_start:parseopt_notrack_end] = parseopt_notrack_lines - lines[parseopt_start:parseopt_end] = parseopt_lines - - lines = [line.rstrip()+'\n' for line in lines] - with open(os.path.join(dirname, 'yacc.py'), 'w') as f: - f.writelines(lines) - - print('Updated yacc.py') - -if __name__ == '__main__': - main() diff --git a/setup.md b/setup.md index 7c3cb50..3b7508c 100644 --- a/setup.md +++ b/setup.md @@ -34,17 +34,3 @@ literally no reason to ever upgrade it. Keep using the version of code that you copied. If you think you've found a bug, check back with the repository to see if it's been fixed. Or submit it as an issue so that it can be looked at. - - - - - - - - - - - - - - diff --git a/test/lex_many_tokens.py b/test/lex_many_tokens.py index 77ae12b..81ae57a 100644 --- a/test/lex_many_tokens.py +++ b/test/lex_many_tokens.py @@ -21,7 +21,7 @@ def t_error(t): pass -lex.lex(optimize=1,lextab="manytab") +lex.lex() lex.runmain(data="TOK34: TOK143: TOK269: TOK372: TOK452: TOK561: TOK999:") diff --git a/test/lex_opt_alias.py b/test/lex_opt_alias.py deleted file mode 100644 index 5d5ed4c..0000000 --- a/test/lex_opt_alias.py +++ /dev/null @@ -1,54 +0,0 @@ -# ----------------------------------------------------------------------------- -# lex_opt_alias.py -# -# Tests ability to match up functions with states, aliases, and -# lexing tables. -# ----------------------------------------------------------------------------- - -import sys -if ".." 
not in sys.path: sys.path.insert(0,"..") - -tokens = ( - 'NAME','NUMBER', - ) - -states = (('instdef','inclusive'),('spam','exclusive')) - -literals = ['=','+','-','*','/', '(',')'] - -# Tokens - -def t_instdef_spam_BITS(t): - r'[01-]+' - return t - -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ANY_NUMBER = NUMBER - -t_ignore = " \t" -t_spam_ignore = t_ignore - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -t_spam_error = t_error - -# Build the lexer -import ply.lex as lex -lex.lex(optimize=1,lextab="aliastab") -lex.runmain(data="3+4") diff --git a/test/lex_optimize.py b/test/lex_optimize.py deleted file mode 100644 index 0e447e6..0000000 --- a/test/lex_optimize.py +++ /dev/null @@ -1,50 +0,0 @@ -# ----------------------------------------------------------------------------- -# lex_optimize.py -# ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lex.lex(optimize=1) -lex.runmain(data="3+4") - - - diff --git a/test/lex_optimize2.py b/test/lex_optimize2.py deleted file mode 100644 index 64555f6..0000000 --- a/test/lex_optimize2.py +++ /dev/null @@ -1,50 +0,0 @@ -# ----------------------------------------------------------------------------- -# lex_optimize2.py -# ----------------------------------------------------------------------------- -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lex.lex(optimize=1,lextab="opt2tab") -lex.runmain(data="3+4") - - - diff --git a/test/lex_optimize3.py b/test/lex_optimize3.py deleted file mode 100644 index b8df5aa..0000000 --- a/test/lex_optimize3.py +++ /dev/null @@ -1,52 +0,0 @@ -# ----------------------------------------------------------------------------- -# lex_optimize3.py -# -# Writes table in a subdirectory structure. -# ----------------------------------------------------------------------------- -import sys - -if ".." 
not in sys.path: sys.path.insert(0,"..") -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lex.lex(optimize=1,lextab="lexdir.sub.calctab" ,outputdir="lexdir/sub") -lex.runmain(data="3+4") - - - diff --git a/test/lex_optimize4.py b/test/lex_optimize4.py deleted file mode 100644 index cc6e2a9..0000000 --- a/test/lex_optimize4.py +++ /dev/null @@ -1,26 +0,0 @@ -# ----------------------------------------------------------------------------- -# lex_optimize4.py -# ----------------------------------------------------------------------------- -import re -import sys - -if ".." not in sys.path: sys.path.insert(0,"..") -import ply.lex as lex - -tokens = [ - "PLUS", - "MINUS", - "NUMBER", - ] - -t_PLUS = r'\+?' -t_MINUS = r'-' -t_NUMBER = r'(\d+)' - -def t_error(t): - pass - - -# Build the lexer -lex.lex(optimize=True, lextab="opt4tab", reflags=re.UNICODE) -lex.runmain(data="3+4") diff --git a/test/lex_token5.py b/test/lex_token5.py deleted file mode 100644 index ef7a3c5..0000000 --- a/test/lex_token5.py +++ /dev/null @@ -1,31 +0,0 @@ -# lex_token5.py -# -# Return a bad token name - -import sys -if ".." not in sys.path: sys.path.insert(0,"..") - -import ply.lex as lex - -tokens = [ - "PLUS", - "MINUS", - "NUMBER", - ] - -t_PLUS = r'\+' -t_MINUS = r'-' - -def t_NUMBER(t): - r'\d+' - t.type = "NUM" - return t - -def t_error(t): - pass - -lex.lex() -lex.input("1234") -t = lex.token() - - diff --git a/test/pkg_test1/__init__.py b/test/pkg_test1/__init__.py deleted file mode 100644 index 0e19558..0000000 --- a/test/pkg_test1/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Tests proper handling of lextab and parsetab files in package structures - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -from .parsing.calcparse import parser - diff --git a/test/pkg_test1/parsing/__init__.py b/test/pkg_test1/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test/pkg_test1/parsing/calclex.py b/test/pkg_test1/parsing/calclex.py deleted file mode 100644 index b3c1a4d..0000000 --- a/test/pkg_test1/parsing/calclex.py +++ /dev/null @@ -1,47 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lexer = lex.lex(optimize=True) - - - diff --git a/test/pkg_test1/parsing/calcparse.py b/test/pkg_test1/parsing/calcparse.py deleted file mode 100644 index c058e9f..0000000 --- a/test/pkg_test1/parsing/calcparse.py +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# dictionary of names -names = { } - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -parser = yacc.yacc() - - - - - diff --git a/test/pkg_test2/__init__.py b/test/pkg_test2/__init__.py deleted file mode 100644 index 0e19558..0000000 --- a/test/pkg_test2/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Tests proper handling of lextab and parsetab files in package structures - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -from .parsing.calcparse import parser - diff --git a/test/pkg_test2/parsing/__init__.py b/test/pkg_test2/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test/pkg_test2/parsing/calclex.py b/test/pkg_test2/parsing/calclex.py deleted file mode 100644 index 789e13f..0000000 --- a/test/pkg_test2/parsing/calclex.py +++ /dev/null @@ -1,47 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lexer = lex.lex(optimize=True, lextab='calclextab') - - - diff --git a/test/pkg_test2/parsing/calcparse.py b/test/pkg_test2/parsing/calcparse.py deleted file mode 100644 index f519338..0000000 --- a/test/pkg_test2/parsing/calcparse.py +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# dictionary of names -names = { } - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -parser = yacc.yacc(tabmodule='calcparsetab') - - - - - diff --git a/test/pkg_test3/__init__.py b/test/pkg_test3/__init__.py deleted file mode 100644 index 0e19558..0000000 --- a/test/pkg_test3/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Tests proper handling of lextab and parsetab files in package structures - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -from .parsing.calcparse import parser - diff --git a/test/pkg_test3/generated/__init__.py b/test/pkg_test3/generated/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test/pkg_test3/parsing/__init__.py b/test/pkg_test3/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test/pkg_test3/parsing/calclex.py b/test/pkg_test3/parsing/calclex.py deleted file mode 100644 index 6ca2c4f..0000000 --- a/test/pkg_test3/parsing/calclex.py +++ /dev/null @@ -1,47 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lexer = lex.lex(optimize=True, lextab='pkg_test3.generated.lextab') - - - diff --git a/test/pkg_test3/parsing/calcparse.py b/test/pkg_test3/parsing/calcparse.py deleted file mode 100644 index 2dcb52b..0000000 --- a/test/pkg_test3/parsing/calcparse.py +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# dictionary of names -names = { } - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -parser = yacc.yacc(tabmodule='pkg_test3.generated.parsetab') - - - - - diff --git a/test/pkg_test4/__init__.py b/test/pkg_test4/__init__.py deleted file mode 100644 index ba9ddac..0000000 --- a/test/pkg_test4/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Tests proper handling of lextab and parsetab files in package structures -# Check of warning messages when files aren't writable - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -import ply.lex -import ply.yacc - -def patched_open(filename, mode): - if 'w' in mode: - raise IOError("Permission denied %r" % filename) - return open(filename, mode) - -ply.lex.open = patched_open -ply.yacc.open = patched_open -try: - from .parsing.calcparse import parser -finally: - del ply.lex.open - del ply.yacc.open - - diff --git a/test/pkg_test4/parsing/__init__.py b/test/pkg_test4/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test/pkg_test4/parsing/calclex.py b/test/pkg_test4/parsing/calclex.py deleted file mode 100644 index b3c1a4d..0000000 --- a/test/pkg_test4/parsing/calclex.py +++ /dev/null @@ -1,47 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -lexer = lex.lex(optimize=True) - - - diff --git a/test/pkg_test4/parsing/calcparse.py b/test/pkg_test4/parsing/calcparse.py deleted file mode 100644 index c058e9f..0000000 --- a/test/pkg_test4/parsing/calcparse.py +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# dictionary of names -names = { } - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -parser = yacc.yacc() - - - - - diff --git a/test/pkg_test5/__init__.py b/test/pkg_test5/__init__.py deleted file mode 100644 index 0e19558..0000000 --- a/test/pkg_test5/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Tests proper handling of lextab and parsetab files in package structures - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -from .parsing.calcparse import parser - diff --git a/test/pkg_test5/parsing/__init__.py b/test/pkg_test5/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test/pkg_test5/parsing/calclex.py b/test/pkg_test5/parsing/calclex.py deleted file mode 100644 index e8759b6..0000000 --- a/test/pkg_test5/parsing/calclex.py +++ /dev/null @@ -1,48 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -import os.path -lexer = lex.lex(optimize=True, outputdir=os.path.dirname(__file__)) - - - diff --git a/test/pkg_test5/parsing/calcparse.py b/test/pkg_test5/parsing/calcparse.py deleted file mode 100644 index 2a1ddfe..0000000 --- a/test/pkg_test5/parsing/calcparse.py +++ /dev/null @@ -1,67 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# dictionary of names -names = { } - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 'statement : expression' - t[0] = t[1] - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -import os.path -parser = yacc.yacc(outputdir=os.path.dirname(__file__)) - - - - - diff --git a/test/pkg_test6/__init__.py b/test/pkg_test6/__init__.py deleted file mode 100644 index 5dbe0cb..0000000 --- a/test/pkg_test6/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Tests proper sorting of modules in yacc.ParserReflect.get_pfunctions - -# Here for testing purposes -import sys -if '..' 
not in sys.path: - sys.path.insert(0, '..') - -from .parsing.calcparse import parser - diff --git a/test/pkg_test6/parsing/__init__.py b/test/pkg_test6/parsing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test/pkg_test6/parsing/calclex.py b/test/pkg_test6/parsing/calclex.py deleted file mode 100644 index e8759b6..0000000 --- a/test/pkg_test6/parsing/calclex.py +++ /dev/null @@ -1,48 +0,0 @@ -# ----------------------------------------------------------------------------- -# calclex.py -# ----------------------------------------------------------------------------- - -import ply.lex as lex - -tokens = ( - 'NAME','NUMBER', - 'PLUS','MINUS','TIMES','DIVIDE','EQUALS', - 'LPAREN','RPAREN', - ) - -# Tokens - -t_PLUS = r'\+' -t_MINUS = r'-' -t_TIMES = r'\*' -t_DIVIDE = r'/' -t_EQUALS = r'=' -t_LPAREN = r'\(' -t_RPAREN = r'\)' -t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' - -def t_NUMBER(t): - r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 - return t - -t_ignore = " \t" - -def t_newline(t): - r'\n+' - t.lexer.lineno += t.value.count("\n") - -def t_error(t): - print("Illegal character '%s'" % t.value[0]) - t.lexer.skip(1) - -# Build the lexer -import os.path -lexer = lex.lex(optimize=True, outputdir=os.path.dirname(__file__)) - - - diff --git a/test/pkg_test6/parsing/calcparse.py b/test/pkg_test6/parsing/calcparse.py deleted file mode 100644 index 6defaf9..0000000 --- a/test/pkg_test6/parsing/calcparse.py +++ /dev/null @@ -1,33 +0,0 @@ -# ----------------------------------------------------------------------------- -# yacc_simple.py -# -# A simple, properly specifier grammar -# ----------------------------------------------------------------------------- - -from .calclex import tokens -from ply import yacc - -# Parsing rules -precedence = ( - ('left','PLUS','MINUS'), - ('left','TIMES','DIVIDE'), - ('right','UMINUS'), - ) - -# dictionary of names -names = { } - -from .statement import * - -from .expression import * - -def p_error(t): - print("Syntax error at '%s'" % t.value) - -import os.path -parser = yacc.yacc(outputdir=os.path.dirname(__file__)) - - - - - diff --git a/test/pkg_test6/parsing/expression.py b/test/pkg_test6/parsing/expression.py deleted file mode 100644 index 028f662..0000000 --- a/test/pkg_test6/parsing/expression.py +++ /dev/null @@ -1,31 +0,0 @@ -# This file contains definitions of expression grammar - -def p_expression_binop(t): - '''expression : expression PLUS expression - | expression MINUS expression - | expression TIMES expression - | expression DIVIDE expression''' - if t[2] == '+' : t[0] = t[1] + t[3] - elif t[2] == '-': t[0] = t[1] - t[3] - elif t[2] == '*': t[0] = t[1] * t[3] - elif t[2] == '/': t[0] = t[1] / t[3] - -def p_expression_uminus(t): - 'expression : MINUS expression %prec UMINUS' - t[0] = -t[2] - -def p_expression_group(t): - 'expression : LPAREN expression RPAREN' - t[0] = t[2] - -def p_expression_number(t): - 'expression : NUMBER' - t[0] = t[1] - -def p_expression_name(t): - 'expression : NAME' - try: - t[0] = names[t[1]] - except LookupError: - print("Undefined name '%s'" % t[1]) - t[0] = 0 diff --git a/test/pkg_test6/parsing/statement.py b/test/pkg_test6/parsing/statement.py deleted file mode 100644 index ef7dc55..0000000 --- a/test/pkg_test6/parsing/statement.py +++ /dev/null @@ -1,9 +0,0 @@ -# This file contains definitions of statement grammar - -def p_statement_assign(t): - 'statement : NAME EQUALS expression' - names[t[1]] = t[3] - -def p_statement_expr(t): - 
'statement : expression' - t[0] = t[1] diff --git a/test/testlex.py b/test/testlex.py index a94ed64..318b47a 100755 --- a/test/testlex.py +++ b/test/testlex.py @@ -239,7 +239,7 @@ def test_lex_state4(self): self.assertRaises(SyntaxError,run_import,"lex_state4") result = sys.stderr.getvalue() self.assert_(check_expected(result, - "State type for state comment must be 'inclusive' or 'exclusive'\n")) + "State type for state 'comment' must be 'inclusive' or 'exclusive'\n")) def test_lex_state5(self): @@ -294,13 +294,6 @@ def test_lex_token4(self): "Bad token name '-'\n")) - def test_lex_token5(self): - try: - run_import("lex_token5") - except ply.lex.LexError: - e = sys.exc_info()[1] - self.assert_(check_expected(str(e),"lex_token5.py:19: Rule 't_NUMBER' returned an unknown token type 'NUM'")) - def test_lex_token_dup(self): run_import("lex_token_dup") result = sys.stderr.getvalue() @@ -361,249 +354,7 @@ def test_lex_closure(self): "(PLUS,'+',1,1)\n" "(NUMBER,4,1,2)\n")) - def test_lex_optimize(self): - try: - os.remove("lextab.py") - except OSError: - pass - try: - os.remove("lextab.pyc") - except OSError: - pass - try: - os.remove("lextab.pyo") - except OSError: - pass - run_import("lex_optimize") - - result = sys.stdout.getvalue() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - self.assert_(os.path.exists("lextab.py")) - - p = subprocess.Popen([sys.executable,'-O','lex_optimize.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("lextab.pyo", 1)) - pymodule_out_remove("lextab.pyo", 1) - - p = subprocess.Popen([sys.executable,'-OO','lex_optimize.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - - if test_pyo: - self.assert_(pymodule_out_exists("lextab.pyo", 2)) - try: - os.remove("lextab.py") - except OSError: - pass - try: - pymodule_out_remove("lextab.pyc") - except OSError: - pass - try: - pymodule_out_remove("lextab.pyo", 2) - except OSError: - pass - - def test_lex_optimize2(self): - try: - os.remove("opt2tab.py") - except OSError: - pass - try: - os.remove("opt2tab.pyc") - except OSError: - pass - try: - os.remove("opt2tab.pyo") - except OSError: - pass - run_import("lex_optimize2") - result = sys.stdout.getvalue() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - self.assert_(os.path.exists("opt2tab.py")) - - p = subprocess.Popen([sys.executable,'-O','lex_optimize2.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("opt2tab.pyo", 1)) - pymodule_out_remove("opt2tab.pyo", 1) - p = subprocess.Popen([sys.executable,'-OO','lex_optimize2.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("opt2tab.pyo", 2)) - try: - os.remove("opt2tab.py") - except OSError: - pass - try: - pymodule_out_remove("opt2tab.pyc") - except OSError: - pass - try: - pymodule_out_remove("opt2tab.pyo", 2) - except OSError: - pass - - def test_lex_optimize3(self): - try: - shutil.rmtree("lexdir") - except OSError: - pass - 
- os.mkdir("lexdir") - os.mkdir("lexdir/sub") - with open("lexdir/__init__.py","w") as f: - f.write("") - with open("lexdir/sub/__init__.py","w") as f: - f.write("") - run_import("lex_optimize3") - result = sys.stdout.getvalue() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - self.assert_(os.path.exists("lexdir/sub/calctab.py")) - - p = subprocess.Popen([sys.executable,'-O','lex_optimize3.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("lexdir/sub/calctab.pyo", 1)) - pymodule_out_remove("lexdir/sub/calctab.pyo", 1) - - p = subprocess.Popen([sys.executable,'-OO','lex_optimize3.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(PLUS,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("lexdir/sub/calctab.pyo", 2)) - try: - shutil.rmtree("lexdir") - except OSError: - pass - - def test_lex_optimize4(self): - - # Regression test to make sure that reflags works correctly - # on Python 3. - - for extension in ['py', 'pyc']: - try: - os.remove("opt4tab.{0}".format(extension)) - except OSError: - pass - - run_import("lex_optimize4") - run_import("lex_optimize4") - - for extension in ['py', 'pyc']: - try: - os.remove("opt4tab.{0}".format(extension)) - except OSError: - pass - - def test_lex_opt_alias(self): - try: - os.remove("aliastab.py") - except OSError: - pass - try: - os.remove("aliastab.pyc") - except OSError: - pass - try: - os.remove("aliastab.pyo") - except OSError: - pass - run_import("lex_opt_alias") - result = sys.stdout.getvalue() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(+,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - self.assert_(os.path.exists("aliastab.py")) - - p = subprocess.Popen([sys.executable,'-O','lex_opt_alias.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(+,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - if test_pyo: - self.assert_(pymodule_out_exists("aliastab.pyo", 1)) - pymodule_out_remove("aliastab.pyo", 1) - - p = subprocess.Popen([sys.executable,'-OO','lex_opt_alias.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(NUMBER,3,1,0)\n" - "(+,'+',1,1)\n" - "(NUMBER,4,1,2)\n")) - - if test_pyo: - self.assert_(pymodule_out_exists("aliastab.pyo", 2)) - try: - os.remove("aliastab.py") - except OSError: - pass - try: - pymodule_out_remove("aliastab.pyc") - except OSError: - pass - try: - pymodule_out_remove("aliastab.pyo", 2) - except OSError: - pass - def test_lex_many_tokens(self): - try: - os.remove("manytab.py") - except OSError: - pass - try: - os.remove("manytab.pyc") - except OSError: - pass - try: - os.remove("manytab.pyo") - except OSError: - pass run_import("lex_many_tokens") result = sys.stdout.getvalue() self.assert_(check_expected(result, @@ -615,37 +366,6 @@ def test_lex_many_tokens(self): "(TOK561,'TOK561:',1,39)\n" "(TOK999,'TOK999:',1,47)\n" )) - - self.assert_(os.path.exists("manytab.py")) - - if implementation() == 'CPython': - p = subprocess.Popen([sys.executable,'-O','lex_many_tokens.py'], - stdout=subprocess.PIPE) - result = p.stdout.read() - self.assert_(check_expected(result, - "(TOK34,'TOK34:',1,0)\n" - "(TOK143,'TOK143:',1,7)\n" - "(TOK269,'TOK269:',1,15)\n" - "(TOK372,'TOK372:',1,23)\n" - 
"(TOK452,'TOK452:',1,31)\n" - "(TOK561,'TOK561:',1,39)\n" - "(TOK999,'TOK999:',1,47)\n" - )) - - self.assert_(pymodule_out_exists("manytab.pyo", 1)) - pymodule_out_remove("manytab.pyo", 1) - try: - os.remove("manytab.py") - except OSError: - pass - try: - os.remove("manytab.pyc") - except OSError: - pass - try: - os.remove("manytab.pyo") - except OSError: - pass # Tests related to run-time behavior of lexers class LexRunTests(unittest.TestCase): diff --git a/test/testyacc.py b/test/testyacc.py index 7e69f09..96d4b0d 100644 --- a/test/testyacc.py +++ b/test/testyacc.py @@ -401,52 +401,4 @@ def test_yacc_prec1(self): "Precedence rule 'left' defined for unknown symbol '/'\n" )) - def test_pkg_test1(self): - from pkg_test1 import parser - self.assertTrue(os.path.exists('pkg_test1/parsing/parsetab.py')) - self.assertTrue(os.path.exists('pkg_test1/parsing/lextab.py')) - self.assertTrue(os.path.exists('pkg_test1/parsing/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - - def test_pkg_test2(self): - from pkg_test2 import parser - self.assertTrue(os.path.exists('pkg_test2/parsing/calcparsetab.py')) - self.assertTrue(os.path.exists('pkg_test2/parsing/calclextab.py')) - self.assertTrue(os.path.exists('pkg_test2/parsing/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - - def test_pkg_test3(self): - from pkg_test3 import parser - self.assertTrue(os.path.exists('pkg_test3/generated/parsetab.py')) - self.assertTrue(os.path.exists('pkg_test3/generated/lextab.py')) - self.assertTrue(os.path.exists('pkg_test3/generated/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - - def test_pkg_test4(self): - from pkg_test4 import parser - self.assertFalse(os.path.exists('pkg_test4/parsing/parsetab.py')) - self.assertFalse(os.path.exists('pkg_test4/parsing/lextab.py')) - self.assertFalse(os.path.exists('pkg_test4/parsing/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - - def test_pkg_test5(self): - from pkg_test5 import parser - self.assertTrue(os.path.exists('pkg_test5/parsing/parsetab.py')) - self.assertTrue(os.path.exists('pkg_test5/parsing/lextab.py')) - self.assertTrue(os.path.exists('pkg_test5/parsing/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - - def test_pkg_test6(self): - from pkg_test6 import parser - self.assertTrue(os.path.exists('pkg_test6/parsing/parsetab.py')) - self.assertTrue(os.path.exists('pkg_test6/parsing/lextab.py')) - self.assertTrue(os.path.exists('pkg_test6/parsing/parser.out')) - r = parser.parse('3+4+5') - self.assertEqual(r, 12) - unittest.main() diff --git a/test/yacc_error7.py b/test/yacc_error7.py index fb131be..abdc834 100644 --- a/test/yacc_error7.py +++ b/test/yacc_error7.py @@ -56,11 +56,11 @@ def p_error(p): print("Line %d: Syntax error at '%s'" % (p.lineno, p.value)) # Scan ahead looking for a name token while True: - tok = yacc.token() + tok = parser.token() if not tok or tok.type == 'RPAREN': break if tok: - yacc.restart() + parser.restart() return None parser = yacc.yacc() diff --git a/test/yacc_nested.py b/test/yacc_nested.py index a1b061e..a3543a9 100644 --- a/test/yacc_nested.py +++ b/test/yacc_nested.py @@ -26,7 +26,7 @@ def p_nest(t): '''nest : B''' print(t[-1]) -the_parser = yacc.yacc(debug = False, write_tables = False) +the_parser = yacc.yacc(debug = False) the_parser.parse('ABC', the_lexer) the_parser.parse('ABC', the_lexer, tracking=True)