diff --git a/CHANGES b/CHANGES
index b82acb0..784784c 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,5 +1,45 @@
-Version 2.6
+Version 3.0
 -----------------------------
+01/13/09: beazley
+          Minor change to the procedure for signalling a syntax error in a
+          production rule.  A normal SyntaxError exception should be raised
+          instead of yacc.SyntaxError.
+
+01/13/09: beazley
+          Added a new method p.set_lineno(n,lineno) that can be used to set the
+          line number of symbol n in grammar rules.  This simplifies manual
+          tracking of line numbers.
+
+01/11/09: beazley
+          Vastly improved debugging support for yacc.parse().  Instead of passing
+          debug as an integer, you can supply a Logger object (see the logging
+          module).  Messages will be generated at the ERROR, INFO, and DEBUG
+          logging levels, each level providing progressively more information.
+          The debugging trace also shows states, grammar rules, values passed
+          into grammar rules, and the result of each reduction.
+
+01/09/09: beazley
+          The yacc() command now does all error-reporting and diagnostics using
+          the interface of the logging module.  Use the errorlog parameter to
+          specify a logging object for error messages.  Use the debuglog parameter
+          to specify a logging object for the 'parser.out' output.
+
+01/09/09: beazley
+          *HUGE* refactoring of the ply.yacc() implementation.  The high-level
+          user interface is backwards compatible, but the internals are completely
+          reorganized into classes.  No more global variables.  The internals
+          are also more extensible.  For example, you can use the classes to
+          construct an LALR(1) parser in an entirely different manner than
+          what is currently the case.  Documentation is forthcoming.
+
+01/07/09: beazley
+          Various cleanup and refactoring of yacc internals.
+
+01/06/09: beazley
+          Fixed a bug with precedence assignment.  yacc was assigning the precedence
+          of each rule based on the left-most token, when in fact, it should have been
+          using the right-most token.  Reported by Bruce Frederiksen.
+
 11/27/08: beazley
           Numerous changes to support Python 3.0 including removal of deprecated
           statements (e.g., has_key) and the addition of compatibility code
diff --git a/doc/internal.html b/doc/internal.html
new file mode 100644
index 0000000..9192bcb
--- /dev/null
+++ b/doc/internal.html
@@ -0,0 +1,851 @@
+
+
++PLY Version: 3.0 +
+ + + + +
+It should be stressed that using PLY at this level is not for the +faint of heart. Generally, it's assumed that you know a bit of +the underlying compiler theory and how an LR parser is put together. + +
+Grammar(terminals) + +
+Creates a new grammar object. terminals is a list of strings +specifying the terminals for the grammar. An instance g of +Grammar has the following methods: ++ +
+g.set_precedence(term,assoc,level) +
+Sets the precedence level and associativity for a given terminal term. +assoc is one of 'right', +'left', or 'nonassoc' and level is a positive integer. The higher +the value of level, the higher the precedence. Here is an example of typical +precedence settings: + +++g.set_precedence('PLUS', 'left',1) +g.set_precedence('MINUS', 'left',1) +g.set_precedence('TIMES', 'left',2) +g.set_precedence('DIVIDE','left',2) +g.set_precedence('UMINUS','left',3) ++ +This method must be called prior to adding any productions to the +grammar with g.add_production(). The precedence of individual grammar +rules is determined by the precedence of the right-most terminal. + +
+g.add_production(name,syms,func=None,file='',line=0) +
+Adds a new grammar rule. name is the name of the rule, +syms is a list of symbols making up the right hand +side of the rule, func is the function to call when +reducing the rule. file and line specify +the filename and line number of the rule and are used for +generating error messages. + ++ ++The list of symbols in syms may include character +literals and %prec specifiers. Here are some +examples: + +
+g.add_production('expr',['expr','PLUS','term'],func,file,line) +g.add_production('expr',['expr','"+"','term'],func,file,line) +g.add_production('expr',['MINUS','expr','%prec','UMINUS'],func,file,line) ++ ++If any kind of error is detected, a GrammarError exception +is raised with a message indicating the reason for the failure. +
+g.set_start(start=None) +
+Sets the starting rule for the grammar. start is a string +specifying the name of the start rule. If start is omitted, +the first grammar rule added with add_production() is taken to be +the starting rule. This method must always be called after all +productions have been added. ++ +
+g.find_unreachable() +
+Diagnostic function. Returns a list of all unreachable non-terminals +defined in the grammar. This is used to identify inactive parts of +the grammar specification. ++ +
+g.infinite_cycles()
+Diagnostic function. Returns a list of all non-terminals in the +grammar that result in an infinite cycle. This condition occurs if +there is no way for a grammar rule to expand to a string containing +only terminal symbols. ++ +
+g.undefined_symbols() +
+Diagnostic function. Returns a list of tuples (name, prod) +corresponding to undefined symbols in the grammar. name is the +name of the undefined symbol and prod is an instance of +Production which has information about the production rule +where the undefined symbol was used. ++ +
+g.unused_terminals() +
+Diagnostic function. Returns a list of terminals that were defined, +but never used in the grammar. ++ +
+g.unused_rules() +
+Diagnostic function. Returns a list of Production instances +corresponding to production rules that were defined in the grammar, +but never used anywhere. This is slightly different +than find_unreachable(). ++ +
+g.unused_precedence() +
+Diagnostic function. Returns a list of tuples (term, assoc)
+corresponding to precedence rules that were set, but never used in the
+grammar.  term is the terminal name and assoc is the
+precedence associativity (e.g., 'left', 'right',
+or 'nonassoc').
+
+g.compute_first() +
+Compute all of the first sets for all symbols in the grammar. Returns a dictionary +mapping symbol names to a list of all first symbols. ++ +
+g.compute_follow() +
+Compute all of the follow sets for all non-terminals in the grammar. +The follow set is the set of all possible symbols that might follow a +given non-terminal. Returns a dictionary mapping non-terminal names +to a list of symbols. ++ +
+g.build_lritems() +
+Calculates all of the LR items for all productions in the grammar. This +step is required before using the grammar for any kind of table generation. +See the section on LR items below. ++ +
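+
+As a rough sketch of how these methods fit together, the following
+code builds a small expression grammar by hand.  The terminal names,
+rules, filename, and the do-nothing reduction function are all made up
+for illustration:
+
+from ply.yacc import Grammar
+
+def reduce_expr(p):                    # placeholder reduction function
+    pass
+
+g = Grammar(['NUMBER','PLUS','TIMES'])
+g.set_precedence('PLUS','left',1)
+g.set_precedence('TIMES','left',2)
+g.add_production('expr',['expr','PLUS','expr'],reduce_expr,'example.py',1)
+g.add_production('expr',['expr','TIMES','expr'],reduce_expr,'example.py',2)
+g.add_production('expr',['NUMBER'],reduce_expr,'example.py',3)
+g.set_start('expr')
+
+# Run the diagnostics described above
+print g.undefined_symbols()           # should be []
+print g.unused_terminals()            # should be []
+
+g.compute_first()
+g.compute_follow()
+g.build_lritems()
+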
+The following attributes are set by the above methods and may be useful +in code that works with the grammar. All of these attributes should be +assumed to be read-only. Changing their values directly will likely +break the grammar. + +
+g.Productions +
+A list of all productions added. The first entry is reserved for +a production representing the starting rule. The objects in this list +are instances of the Production class, described shortly. ++ +
+g.Prodnames +
+A dictionary mapping the names of nonterminals to a list of all +productions of that nonterminal. ++ +
+g.Terminals +
+A dictionary mapping the names of terminals to a list of the +production numbers where they are used. ++ +
+g.Nonterminals +
+A dictionary mapping the names of nonterminals to a list of the +production numbers where they are used. ++ +
+g.First +
+A dictionary representing the first sets for all grammar symbols. This is +computed and returned by the compute_first() method. ++ +
+g.Follow +
+A dictionary representing the follow sets for all non-terminals in the
+grammar.  This is
+computed and returned by the compute_follow() method.
+
+g.Start +
+Starting symbol for the grammar. Set by the set_start() method. ++ +For the purposes of debugging, a Grammar object supports the __len__() and +__getitem__() special methods. Accessing g[n] returns the nth production +from the grammar. + + +
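+
+For example, continuing the sketch above, the grammar can be inspected
+like this (the printed output is illustrative):
+
+for n in range(len(g)):
+    print n, g[n]                     # each Production instance
+
+print g.Terminals['PLUS']             # production numbers using PLUS
+print g.First['expr']                 # first set from compute_first()
+print g.Follow['expr']                # follow set from compute_follow()
+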
+p.name +
+The name of the production. For a grammar rule such as A : B C D, this is 'A'. ++ +
+p.prod +
+A tuple of symbols making up the right-hand side of the production. For a grammar rule such as A : B C D, this is ('B','C','D'). ++ +
+p.number +
+Production number. An integer containing the index of the production in the grammar's Productions list. ++ +
+p.func +
+The name of the reduction function associated with the production. +This is the function that will execute when reducing the entire +grammar rule during parsing. ++ +
+p.callable +
+The callable object associated with the name in p.func. This is None +unless the production has been bound using bind(). ++ +
+p.file +
+Filename associated with the production. Typically this is the file where the production was defined. Used for error messages. ++ +
+p.lineno +
+Line number associated with the production. Typically this is the line number in p.file where the production was defined. Used for error messages. ++ +
+p.prec +
+Precedence and associativity associated with the production. This is a tuple (assoc,level) where +assoc is one of 'left','right', or 'nonassoc' and level is +an integer. This value is determined by the precedence of the right-most terminal symbol in the production +or by use of the %prec specifier when adding the production. ++ +
+p.usyms +
+A list of all unique symbols found in the production. ++ +
+p.lr_items +
+A list of all LR items for this production. This attribute only has a meaningful value if the +Grammar.build_lritems() method has been called. The items in this list are +instances of LRItem described below. ++ +
+p.lr_next +
+The head of a linked-list representation of the LR items in p.lr_items. +This attribute only has a meaningful value if the Grammar.build_lritems() +method has been called. Each LRItem instance has a lr_next attribute +to move to the next item. The list is terminated by None. ++ +
+p.bind(dict) +
+Binds the production function name in p.func to a callable object in +dict. This operation is typically carried out in the last step +prior to running the parsing engine and is needed since parsing tables are typically +read from files which only include the function names, not the functions themselves. ++ +
+Production objects support +the __len__(), __getitem__(), and __str__() +special methods. +len(p) returns the number of symbols in p.prod +and p[n] is the same as p.prod[n]. + +
+Here is an interactive example that shows what LR items look like.
+In this example, g is a Grammar
+object.
+
++ +In the above code, p represents the first grammar rule. In +this case, a rule 'statement -> ID = expr'. + ++>>> g.build_lritems() +>>> p = g[1] +>>> p +Production(statement -> ID = expr) +>>> ++
+Now, let's look at the LR items for p. + +
+
+
+>>> p.lr_items
+[LRItem(statement -> . ID = expr),
+ LRItem(statement -> ID . = expr),
+ LRItem(statement -> ID = . expr),
+ LRItem(statement -> ID = expr .)]
+>>>
+
+
+In each LR item, the dot (.) represents a specific stage of parsing.
+From one item to the next, the dot advances by one symbol.  It is only
+when the dot reaches the very end that a production
+is successfully parsed.
+
+An instance lr of LRItem has the following +attributes that hold information related to that specific stage of +parsing. + +
+lr.name +
+The name of the grammar rule. For example, 'statement' in the above example. ++ +
+lr.prod +
+A tuple of symbols representing the right-hand side of the production, including the +special '.' character. For example, ('ID','.','=','expr'). ++ +
+lr.number +
+An integer representing the production number in the grammar. ++ +
+lr.usyms +
+A set of unique symbols in the production. Inherited from the original Production instance. ++ +
+lr.lr_index +
+An integer representing the position of the dot (.). You should never use lr.prod.index() +to search for it--the result will be wrong if the grammar happens to also use (.) as a character +literal. ++ +
+lr.lr_after +
+A list of all productions that can legally appear immediately to the right of the +dot (.). This list contains Production instances. This attribute +represents all of the possible branches a parse can take from the current position. +For example, suppose that lr represents a stage immediately before +an expression like this: + ++ ++>>> lr +LRItem(statement -> ID = . expr) +>>> ++ +Then, the value of lr.lr_after might look like this, showing all productions that +can legally appear next: + ++>>> lr.lr_after +[Production(expr -> expr PLUS expr), + Production(expr -> expr MINUS expr), + Production(expr -> expr TIMES expr), + Production(expr -> expr DIVIDE expr), + Production(expr -> MINUS expr), + Production(expr -> LPAREN expr RPAREN), + Production(expr -> NUMBER), + Production(expr -> ID)] +>>> ++ +
+lr.lr_before +
+The grammar symbol that appears immediately before the dot (.) or None if +at the beginning of the parse. ++ +
+lr.lr_next +
+A link to the next LR item, representing the next stage of the parse. None if lr +is the last LR item. ++ +LRItem instances also support the __len__() and __getitem__() special methods. +len(lr) returns the number of items in lr.prod including the dot (.). lr[n] +returns lr.prod[n]. + +
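+
+For example, here is a small sketch of walking the linked list of LR
+items for a production p, assuming Grammar.build_lritems()
+has already been called:
+
+item = p.lr_next                      # head of the LR item list
+while item is not None:
+    print item, "dot at", item.lr_index
+    item = item.lr_next               # the list is terminated by None
+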
+It goes without saying that all of the attributes associated with LR +items should be assumed to be read-only. Modifications will very +likely create a small black-hole that will consume you and your code. + +
+LRTable() +
+Create an empty LRTable object. This object contains only the information needed to +run an LR parser. ++ +An instance lrtab of LRTable has the following methods: + +
+lrtab.read_table(module) +
+Populates the LR table with information from the module specified in module. +module is either a module object already loaded with import or +the name of a Python module. If it's a string containing a module name, it is +loaded and parsing data is extracted. Returns the signature value that was used +when initially writing the tables. Raises a VersionError exception if +the module was created using an incompatible version of PLY. ++ +
+lrtab.bind_callables(dict) +
+This binds all of the function names used in productions to callable objects +found in the dictionary dict. During table generation and when reading +LR tables from files, PLY only uses the names of action functions such as 'p_expr', +'p_statement', etc. In order to actually run the parser, these names +have to be bound to callable objects. This method is always called prior to +running a parser. ++ +After lrtab has been populated, the following attributes are defined. + +
+lrtab.lr_method +
+The LR parsing method used (e.g., 'LALR') ++ + +
+lrtab.lr_productions +
+The production list. If the parsing tables have been newly +constructed, this will be a list of Production instances. If +the parsing tables have been read from a file, it's a list +of MiniProduction instances. This, together +with lr_action and lr_goto contain all of the +information needed by the LR parsing engine. ++ +
+lrtab.lr_action +
+The LR action dictionary that implements the underlying state machine. +The keys of this dictionary are the LR states. ++ +
+lrtab.lr_goto +
+The LR goto table that contains information about grammar rule reductions. ++ + +
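+
+As a hedged sketch, reading previously generated tables and preparing
+them for the parsing engine might look like this.  The module
+name 'parsetab' and the use of globals() as the function
+dictionary are assumptions for illustration:
+
+from ply.yacc import LRTable
+
+lrtab = LRTable()
+sig = lrtab.read_table('parsetab')    # may raise VersionError
+lrtab.bind_callables(globals())       # bind 'p_expr', etc. to real functions
+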
+LRGeneratedTable(grammar, method='LALR',log=None) +
+Create the LR parsing tables on a grammar. grammar is an instance of Grammar, +method is a string with the parsing method ('SLR' or 'LALR'), and +log is a logger object used to write debugging information. The debugging information +written to log is the same as what appears in the parser.out file created +by yacc. By supplying a custom logger with a different message format, it is possible to get +more information (e.g., the line number in yacc.py used for issuing each line of +output in the log). The result is an instance of LRGeneratedTable. ++ +
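+
+For example, a minimal sketch, assuming g is the Grammar
+object built earlier and using a logger from the standard logging
+module:
+
+import logging
+logging.basicConfig(filename='tables.log',level=logging.DEBUG)
+
+from ply.yacc import LRGeneratedTable
+lr = LRGeneratedTable(g,method='LALR',log=logging.getLogger())
+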
+An instance lr of LRGeneratedTable has the following attributes. + +
+lr.grammar +
+A link to the Grammar object used to construct the parsing tables. ++ +
+lr.lr_method +
+The LR parsing method used (e.g., 'LALR') ++ + +
+lr.lr_productions +
+A reference to grammar.Productions. This, together with lr_action and lr_goto +contain all of the information needed by the LR parsing engine. ++ +
+lr.lr_action +
+The LR action dictionary that implements the underlying state machine. The keys of this dictionary are +the LR states. ++ +
+lr.lr_goto +
+The LR goto table that contains information about grammar rule reductions. ++ +
+lr.sr_conflicts +
+A list of tuples (state,token,resolution) identifying all shift/reduce conflicts. state is the LR state +number where the conflict occurred, token is the token causing the conflict, and resolution is +a string describing the resolution taken. resolution is either 'shift' or 'reduce'. ++ +
+lr.rr_conflicts +
+A list of tuples (state,rule,rejected) identifying all reduce/reduce conflicts. state is the +LR state number where the conflict occurred, rule is the production rule that was selected +and rejected is the production rule that was rejected. Both rule and rejected are +instances of Production. They can be inspected to provide the user with more information. ++ +
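+
+For example, a small sketch that reports any conflicts found during
+table construction:
+
+for state,tok,resolution in lr.sr_conflicts:
+    print "shift/reduce in state %d on %s: %s" % (state,tok,resolution)
+
+for state,rule,rejected in lr.rr_conflicts:
+    print "reduce/reduce in state %d" % state
+    print "   selected :", rule
+    print "   rejected :", rejected
+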
+There are two public methods of LRGeneratedTable. + +
+lr.write_table(modulename,outputdir="",signature="") +
+Writes the LR parsing table information to a Python module.  modulename is a string
+specifying the name of a module such as "parsetab".  outputdir is the name of a
+directory where the module should be created.  signature is a string representing a
+grammar signature that's written into the output file.  This can be used to detect when
+the data stored in a module file is out-of-sync with the grammar specification (and that
+the tables need to be regenerated).  If modulename is a string "parsetab",
+this function creates a file called parsetab.py.  If the module name represents a
+package such as "foo.bar.parsetab", then only the last component, "parsetab",
+is used.
+
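+
+For example (the output directory and signature value are
+illustrative):
+
+lr.write_table('parsetab',outputdir='.',signature='deadbeef')
+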
+LRParser(lrtab, error_func) +
+Create an LRParser. lrtab is an instance of LRTable +containing the LR production and state tables. error_func is the +error function to invoke in the event of a parsing error. ++ +An instance p of LRParser has the following methods: + +
+p.parse(input=None,lexer=None,debug=0,tracking=0,tokenfunc=None) +
+Run the parser. input is a string, which if supplied is fed into the +lexer using its input() method. lexer is an instance of the +Lexer class to use for tokenizing. If not supplied, the last lexer +created with the lex module is used. debug is a boolean flag +that enables debugging. tracking is a boolean flag that tells the +parser to perform additional line number tracking. tokenfunc is a callable +function that returns the next token. If supplied, the parser will use it to get +all tokens. ++ +
+p.restart() +
+Resets the parser state for a parse already in progress. ++ +
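+
+Putting the low-level pieces together, here is a hedged sketch of
+driving the parsing engine directly.  It assumes g is a fully
+constructed Grammar, lexer was built
+with ply.lex, my_error_func is a p_error-style
+handler, and the reduction functions live in the calling module; all
+of these names are illustrative:
+
+from ply.yacc import LRGeneratedTable, LRParser
+
+lrtab = LRGeneratedTable(g,method='LALR')
+lrtab.bind_callables(globals())       # attach reduction functions by name
+parser = LRParser(lrtab,my_error_func)
+result = parser.parse("3 + 4 * 5",lexer=lexer)
+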
+The ParserReflect class is used to collect parser specification data +from a Python module or object. This class is what collects all of the +p_rule() functions in a PLY file, performs basic error checking, +and collects all of the needed information to build a grammar. Most of the +high-level PLY interface as used by the yacc() function is actually +implemented by this class. + +
+ParserReflect(pdict, log=None) +
+Creates a ParserReflect instance. pdict is a dictionary +containing parser specification data. This dictionary typically corresponds +to the module or class dictionary of code that implements a PLY parser. +log is a logger instance that will be used to report error +messages. ++ +An instance p of ParserReflect has the following methods: + +
+p.get_all() +
+Collect and store all required parsing information. ++ +
+p.validate_all() +
+Validate all of the collected parsing information.  This is a separate step
+from p.get_all() as a performance optimization.  To decrease
+parser start-up time, a parser can elect to only validate the
+parsing data when regenerating the parsing tables.  The validation
+step tries to collect as much information as possible rather than
+raising an exception at the first sign of trouble.  The attribute
+p.error is set if there are any validation errors.  The
+value of this attribute is also returned.
+
+p.signature() +
+Compute a signature representing the contents of the collected parsing +data. The signature value should change if anything in the parser +specification has changed in a way that would justify parser table +regeneration. This method can be called after p.get_all(), +but before p.validate_all(). ++ +The following attributes are set in the process of collecting data: + +
+p.start +
+The grammar start symbol, if any. Taken from pdict['start']. ++ +
+p.error_func +
+The error handling function or None. Taken from pdict['p_error']. ++ +
+p.tokens +
+The token list. Taken from pdict['tokens']. ++ +
+p.prec +
+The precedence specifier. Taken from pdict['precedence']. ++ +
+p.preclist +
+A parsed version of the precedence specifier.  A list of tuples of the form
+(token,assoc,level) where token is the terminal symbol,
+assoc is the associativity (e.g., 'left') and level
+is a numeric precedence level.
+
+p.grammar +
+A list of tuples (name, rules) representing the grammar rules. name is the +name of a Python function or method in pdict that starts with "p_". +rules is a list of tuples (filename,line,prodname,syms) representing +the grammar rules found in the documentation string of that function. filename and line contain location +information that can be used for debugging. prodname is the name of the +production. syms is the right-hand side of the production. If you have a +function like this + ++ ++def p_expr(p): + '''expr : expr PLUS expr + | expr MINUS expr + | expr TIMES expr + | expr DIVIDE expr''' ++ +then the corresponding entry in p.grammar might look like this: + ++('p_expr', [ ('calc.py',10,'expr', ['expr','PLUS','expr']), + ('calc.py',11,'expr', ['expr','MINUS','expr']), + ('calc.py',12,'expr', ['expr','TIMES','expr']), + ('calc.py',13,'expr', ['expr','DIVIDE','expr']) + ]) ++
+p.pfuncs +
+A sorted list of tuples (line, file, name, doc) representing all of +the p_ functions found. line and file give location +information. name is the name of the function. doc is the +documentation string. This list is sorted in ascending order by line number. ++ +
+p.files +
+A dictionary holding all of the source filenames that were encountered +while collecting parser information. Only the keys of this dictionary have +any meaning. ++ +
+p.error +
+An attribute that indicates whether or not any critical errors
+occurred in validation.  If this is set, it means that some kind
+of problem was detected and that no further processing should be
+performed.
+
+
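+
+As a final sketch, here is roughly how ParserReflect might
+be used to mimic the first steps of what yacc() does
+internally.  The module name mymodule is an assumption for
+illustration:
+
+import logging, mymodule              # mymodule holds the p_* functions
+from ply.yacc import ParserReflect
+
+pinfo = ParserReflect(mymodule.__dict__,log=logging.getLogger())
+pinfo.get_all()
+sig = pinfo.signature()               # usable before validate_all()
+if pinfo.validate_all():              # returns pinfo.error
+    print "errors found; tables should not be built"
+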
-PLY Version: 2.5 +PLY Version: 3.0
@@ -97,7 +97,10 @@
Since PLY was primarily developed as an instructional tool, you will @@ -245,11 +248,7 @@
-The tokens returned by lex.token() are instances +Lexers also support the iteration protocol. So, you can write the above loop as follows: + +@@ -282,11 +284,11 @@@@ -310,7 +312,16 @@3.1 Lex Example
''' # Give the lexer some input -lex.input(data) +lexer.input(data) # Tokenize -while 1: - tok = lex.token() +while True: + tok = lexer.token() if not tok: break # No more input print tok3.1 Lex Example
++ +The tokens returned by lexer.token() are instances of LexToken. This object has attributes tok.type, tok.value, tok.lineno, and tok.lexpos. The following code shows an example of @@ -319,8 +330,8 @@+for tok in lexer: + print tok ++
 # Tokenize
-while 1:
-    tok = lex.token()
+while True:
+    tok = lexer.token()
     if not tok: break      # No more input
     print tok.type, tok.value, tok.lineno, tok.lexpos
@@ -429,7 +440,7 @@
3.3 Specification of tokens
... } -tokens = ['LPAREN','RPAREN',...,'ID'] + reserved.values() +tokens = ['LPAREN','RPAREN',...,'ID'] + list(reserved.values()) def t_ID(t): r'[a-zA-Z_][a-zA-Z_0-9]*' @@ -530,11 +541,10 @@3.6 Line numbers and positional information
# input is the input text string # token is a token instance def find_column(input,token): - i = token.lexpos - while i > 0: - if input[i] == '\n': break - i -= 1 - column = (token.lexpos - i)+1 + last_cr = input.rfind('\n',0,token.lexpos) + if last_cr < 0: + last_cr = 0 + column = (token.lexpos - last_cr) + 1 return column
To build the lexer, the function lex.lex() is used.  This function
uses Python reflection (or introspection) to read the regular expression rules
-out of the calling context and build the lexer. Once the lexer has been built, two functions can
+out of the calling context and build the lexer. Once the lexer has been built, two methods can
be used to control the lexer.
-lexer = lex.lex() -lexer.input(sometext) +lex.lex() +lex.input(sometext) while 1: - tok = lexer.token() + tok = lex.token() if not tok: break print tok
-This latter technique should be used if you intend to use multiple lexers in your application. Simply define each -lexer in its own module and use the object returned by lex() as appropriate. - -
-Note: The global functions lex.input() and lex.token() are bound to the input() -and token() methods of the last lexer created by the lex module. +In this example, the module-level functions lex.input() and lex.token() are bound to the input() +and token() methods of the last lexer created by the lex module. This interface may go away at some point so +it's probably best not to use it.
-When building a lexer from class, you should construct the lexer from
-an instance of the class, not the class object itself. Also, for
-reasons that are subtle, you should NOT
-invoke lex.lex() inside the __init__() method of
-your class. If you do, it may cause bizarre behavior if someone tries
-to duplicate a lexer object.
+
+When building a lexer from a class, you should construct the lexer from
+an instance of the class, not the class object itself. This is because
+PLY only works properly if the lexer actions are defined by bound methods.
+
+
@@ -856,11 +860,7 @@
3.14 Alternative specification of lexers
# Note addition of self parameter since we're in a class def t_NUMBER(self,t): r'\d+' - try: - t.value = int(t.value) - except ValueError: - print "Line %d: Number %s is too large!" % (t.lineno,t.value) - t.value = 0 + t.value = int(t.value) return t # Define a rule so we can track line numbers @@ -878,12 +878,12 @@3.14 Alternative specification of lexers
    # Build the lexer
    def build(self,**kwargs):
-        self.lexer = lex.lex(object=self, **kwargs)
+        self.lexer = lex.lex(module=self, **kwargs)

    # Test its output
    def test(self,data):
         self.lexer.input(data)
-        while 1:
+        while True:
              tok = self.lexer.token()
              if not tok: break
              print tok
@@ -895,18 +895,80 @@
3.14 Alternative specification of lexers
+When using the module option to lex(), PLY collects symbols +from the underlying object using the dir() function. There is no +direct access to the __dict__ attribute of the object supplied as a +module value. + +
+Finally, if you want to keep things nicely encapsulated, but don't want to use a +full-fledged class definition, lexers can be defined using closures. For example: + +
+++import ply.lex as lex + +# List of token names. This is always required +tokens = ( + 'NUMBER', + 'PLUS', + 'MINUS', + 'TIMES', + 'DIVIDE', + 'LPAREN', + 'RPAREN', +) + +def MyLexer(): + # Regular expression rules for simple tokens + t_PLUS = r'\+' + t_MINUS = r'-' + t_TIMES = r'\*' + t_DIVIDE = r'/' + t_LPAREN = r'\(' + t_RPAREN = r'\)' + + # A regular expression rule with some action code + def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + + # Define a rule so we can track line numbers + def t_newline(t): + r'\n+' + t.lexer.lineno += len(t.value) + + # A string containing ignored characters (spaces and tabs) + t_ignore = ' \t' + + # Error handling rule + def t_error(t): + print "Illegal character '%s'" % t.value[0] + t.lexer.skip(1) + + # Build the lexer from my environment and return it + return lex.lex() ++
+One way to do this is to keep a set of global variables in the module +where you created the lexer. For example:
-Alternatively, you can store this information inside the Lexer object created by lex(). To this, you can use the lexer attribute
-of tokens passed to the various rules. For example:
+If you don't like the use of a global variable, another place to store
+information is inside the Lexer object created by lex().
+To do this, you can use the lexer attribute of tokens passed to
+the various rules. For example:
r'\d+' global num_count num_count += 1 - try: - t.value = int(t.value) - except ValueError: - print "Line %d: Number %s is too large!" % (t.lineno,t.value) - t.value = 0 + t.value = int(t.value) return t
-This latter approach has the advantage of storing information inside
-the lexer object itself---something that may be useful if multiple instances
-of the same lexer have been created. However, it may also feel kind
-of "hacky" to the OO purists. Just to put their mind at some ease, all
+This latter approach has the advantage of being simple and working
+correctly when multiple instantiations of a given
+lexer exist in the same application.  However, this might also feel
+like a gross violation of encapsulation to OO purists.
+Just to put your mind at some ease, all
internal attributes of the lexer (with the exception of lineno) have names
that are prefixed by lex (e.g., lexdata,lexpos, etc.).  Thus,
-it should be perfectly safe to store attributes in the lexer that
-don't have names starting with that prefix.
+it is perfectly safe to store attributes in the lexer that
+don't have names starting with that prefix or a name that conflicts with one of the
+predefined methods (e.g., input(), token(), etc.).

def t_NUMBER(t):
    r'\d+'
    t.lexer.num_count += 1     # Note use of lexer attribute
-    try:
-         t.value = int(t.value)
-    except ValueError:
-         print "Line %d: Number %s is too large!" % (t.lineno,t.value)
-         t.value = 0
+    t.value = int(t.value)
    return t

lexer = lex.lex()
@@ -944,17 +1000,20 @@
3.15 Maintaining state
-A third approach is to define the lexer as a class as shown in the previous example: +If you don't like assigning values on the lexer object, you can define your lexer as a class as +shown in the previous section:
@@ -986,10 +1037,28 @@@@ -963,11 +1022,7 @@3.15 Maintaining state
def t_NUMBER(self,t): r'\d+' self.num_count += 1 - try: - t.value = int(t.value) - except ValueError: - print "Line %d: Number %s is too large!" % (t.lineno,t.value) - t.value = 0 + t.value = int(t.value) return t def build(self, **kwargs): @@ -975,10 +1030,6 @@3.15 Maintaining state
def __init__(self): self.num_count = 0 - -# Create a lexer -m = MyLexer() -lexer = lex.lex(object=m)
+State can also be managed through closures. For example, in Python 3: + +
+++def MyLexer(): + num_count = 0 + ... + def t_NUMBER(t): + r'\d+' + nonlocal num_count + num_count += 1 + t.value = int(t.value) + return t + ... ++
-If necessary, a lexer object can be quickly duplicated by invoking its clone() method. For example: +If necessary, a lexer object can be duplicated by invoking its clone() method. For example:
-A good way to think about syntax directed translation is to simply think of each symbol in the grammar as some -kind of object. The semantics of the language are then expressed as a collection of methods/operations on these -objects. +A good way to think about syntax directed translation is to +view each symbol in the grammar as a kind of object. Associated +with each symbol is a value representing its "state" (for example, the +val attribute above). Semantic +actions are then expressed as a collection of functions or methods +that operate on the symbols and associated values.@@ -1009,9 +1078,15 @@3.16 Lexer cloning
cloned lexers could be used to handle different input files.-Special considerations need to be made when cloning lexers that also maintain their own -internal state. Namely, you need to be aware that the newly created lexers will share all -of this state with the original lexer. For example, if you defined a lexer as a class and did this: +Creating a clone is different than calling lex.lex() in that +PLY doesn't regenerate any of the internal tables or regular expressions. So, + +
+Special considerations need to be made when cloning lexers that also +maintain their own internal state using classes or closures. Namely, +you need to be aware that the newly created lexers will share all of +this state with the original lexer. For example, if you defined a +lexer as a class and did this:
In the grammar, symbols such as NUMBER, +, -, *, and / are known -as terminals and correspond to raw input tokens. Identifiers such as term and factor refer to more -complex rules, typically comprised of a collection of tokens. These identifiers are known as non-terminals. +as terminals and correspond to raw input tokens. Identifiers such as term and factor refer to +grammar rules comprised of a collection of terminals and other rules. These identifiers are known as non-terminals.@@ -1024,8 +1099,9 @@3.16 Lexer cloning
Then both a and b are going to be bound to the same object m and any changes to m will be reflected in both lexers. It's -important to emphasize that clone() is not meant to make a totally new copy of a -lexer. If you want to do that, call lex() again to create a new lexer. +important to emphasize that clone() is only meant to create a new lexer +that reuses the regular expressions and environment of another lexer. If you +need to make a totally new copy of a lexer, then call lex() again.3.17 Internal lexer state
@@ -1045,8 +1121,9 @@3.17 Internal lexer state
lexer.lineno
-The current value of the line number attribute stored in the lexer. This can be modified as needed to -change the line number. +The current value of the line number attribute stored in the lexer. PLY only specifies that the attribute +exists---it never sets, updates, or performs any processing with it. If you want to track line numbers, +you will need to add code yourself (see the section on line numbers and positional information).@@ -1066,7 +1143,6 @@
3.17 Internal lexer state
3.18 Conditional lexing and start conditions
- In advanced parsing applications, it may be useful to have different lexing states. For instance, you may want the occurrence of a certain token or syntactic construct to trigger a different kind of lexing. @@ -1329,9 +1405,10 @@4. Parsing basics
+ The semantic behavior of a language is often specified using a technique known as syntax directed translation. In syntax directed translation, attributes are attached to each symbol in a given grammar @@ -1357,9 +1434,12 @@
4. Parsing basics
Yacc uses a parsing technique known as LR-parsing or shift-reduce parsing. LR parsing is a @@ -1368,64 +1448,78 @@
-LR parsing is commonly implemented by shifting grammar symbols onto a stack and looking at the stack and the next -input token for patterns. The details of the algorithm can be found in a compiler text, but the -following example illustrates the steps that are performed if you wanted to parse the expression -3 + 5 * (10 - 20) using the grammar defined above: +LR parsing is commonly implemented by shifting grammar symbols onto a +stack and looking at the stack and the next input token for patterns that +match one of the grammar rules. +The details of the algorithm can be found in a compiler textbook, but the +following example illustrates the steps that are performed if you +wanted to parse the expression +3 + 5 * (10 - 20) using the grammar defined above. In the example, +the special symbol $ represents the end of input. +
- -When parsing the expression, an underlying state machine and the current input token determine what to do next. -If the next token looks like part of a valid grammar rule (based on other items on the stack), it is generally shifted -onto the stack. If the top of the stack contains a valid right-hand-side of a grammar rule, it is -usually "reduced" and the symbols replaced with the symbol on the left-hand-side. When this reduction occurs, the -appropriate action is triggered (if defined). If the input token can't be shifted and the top of stack doesn't match -any grammar rules, a syntax error has occurred and the parser must take some kind of recovery step (or bail out). - -Step Symbol Stack Input Tokens Action ---- --------------------- --------------------- ------------------------------- -1 $ 3 + 5 * ( 10 - 20 )$ Shift 3 -2 $ 3 + 5 * ( 10 - 20 )$ Reduce factor : NUMBER -3 $ factor + 5 * ( 10 - 20 )$ Reduce term : factor -4 $ term + 5 * ( 10 - 20 )$ Reduce expr : term -5 $ expr + 5 * ( 10 - 20 )$ Shift + -6 $ expr + 5 * ( 10 - 20 )$ Shift 5 -7 $ expr + 5 * ( 10 - 20 )$ Reduce factor : NUMBER -8 $ expr + factor * ( 10 - 20 )$ Reduce term : factor -9 $ expr + term * ( 10 - 20 )$ Shift * -10 $ expr + term * ( 10 - 20 )$ Shift ( -11 $ expr + term * ( 10 - 20 )$ Shift 10 -12 $ expr + term * ( 10 - 20 )$ Reduce factor : NUMBER -13 $ expr + term * ( factor - 20 )$ Reduce term : factor -14 $ expr + term * ( term - 20 )$ Reduce expr : term -15 $ expr + term * ( expr - 20 )$ Shift - -16 $ expr + term * ( expr - 20 )$ Shift 20 -17 $ expr + term * ( expr - 20 )$ Reduce factor : NUMBER -18 $ expr + term * ( expr - factor )$ Reduce term : factor -19 $ expr + term * ( expr - term )$ Reduce expr : expr - term -20 $ expr + term * ( expr )$ Shift ) -21 $ expr + term * ( expr ) $ Reduce factor : (expr) -22 $ expr + term * factor $ Reduce term : term * factor -23 $ expr + term $ Reduce expr : expr + term -24 $ expr $ Reduce expr -25 $ $ Success! --
-It is important to note that the underlying implementation is built around a large finite-state machine that is encoded -in a collection of tables. The construction of these tables is quite complicated and beyond the scope of this discussion. -However, subtle details of this process explain why, in the example above, the parser chooses to shift a token -onto the stack in step 9 rather than reducing the rule expr : expr + term. - -
+It is important to note that the underlying implementation is built +around a large finite-state machine that is encoded in a collection of +tables. The construction of these tables is non-trivial and +beyond the scope of this discussion. However, subtle details of this +process explain why, in the example above, the parser chooses to shift +a token onto the stack in step 9 rather than reducing the +rule expr : expr + term. + +
-For tokens, the "value" of the corresponding p[i] is the -same as the p.value attribute assigned -in the lexer module. For non-terminals, the value is determined by -whatever is placed in p[0] when rules are reduced. This -value can be anything at all. However, it probably most common for -the value to be a simple Python type, a tuple, or an instance. In this example, we -are relying on the fact that the NUMBER token stores an integer value in its value -field. All of the other rules simply perform various types of integer operations and store -the result. - -@@ -1507,42 +1601,49 @@5.1 An example
-Note: The use of negative indices have a special meaning in yacc---specially p[-1] does -not have the same value as p[3] in this example. Please see the section on "Embedded Actions" for further -details. -
-The first rule defined in the yacc specification determines the starting grammar
-symbol (in this case, a rule for expression appears first). Whenever
-the starting rule is reduced by the parser and no more input is available, parsing
-stops and the final value is returned (this value will be whatever the top-most rule
-placed in p[0]). Note: an alternative starting symbol can be specified using the start keyword argument to
+For tokens, the "value" of the corresponding p[i] is the
+same as the p.value attribute assigned in the lexer
+module.  For non-terminals, the value is determined by whatever is
+placed in p[0] when rules are reduced.  This value can be
+anything at all.  However, it is probably most common for the value to be
+a simple Python type, a tuple, or an instance.  In this example, we
+are relying on the fact that the NUMBER token stores an
+integer value in its value field.  All of the other rules simply
+perform various types of integer operations and propagate the result.
+
+
+
+Note: The use of negative indices has a special meaning in
+yacc---specifically, p[-1] does not have the same value
+as p[3] in this example.  Please see the section on "Embedded
+Actions" for further details.
+
+ ++The first rule defined in the yacc specification determines the +starting grammar symbol (in this case, a rule for expression +appears first). Whenever the starting rule is reduced by the parser +and no more input is available, parsing stops and the final value is +returned (this value will be whatever the top-most rule placed +in p[0]). Note: an alternative starting symbol can be +specified using the start keyword argument to yacc(). -
The p_error(p) rule is defined to catch syntax errors. See the error handling section -below for more detail. +
The p_error(p) rule is defined to catch syntax errors. +See the error handling section below for more detail.
-To build the parser, call the yacc.yacc() function. This function -looks at the module and attempts to construct all of the LR parsing tables for the grammar -you have specified. The first time yacc.yacc() is invoked, you will get a message -such as this: +To build the parser, call the yacc.yacc() function. This +function looks at the module and attempts to construct all of the LR +parsing tables for the grammar you have specified. The first +time yacc.yacc() is invoked, you will get a message such as +this:
@@ -1554,7 +1655,8 @@$ python calcparse.py -yacc: Generating LALR parsing table... +Generating LALR tables calc >
If any errors are detected in your grammar specification, yacc.py will produce @@ -1569,7 +1671,16 @@
+The final part of the example shows how to actually run the parser
+created by
+yacc().  To run the parser, you simply have to call
+parse() with a string of input text.  This will run all
+of the grammar rules and return the result of the entire parse.  The
+result returned is the value assigned to p[0] in the starting
+grammar rule.
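+
+For instance, a minimal run might look like this (the input string is
+just an illustration):
+
+result = yacc.parse("3 + 4 * (5 - 6)")
+print result         # prints -1, the value of p[0] from the start rule
+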
-This declaration specifies that PLUS/MINUS have -the same precedence level and are left-associative and that -TIMES/DIVIDE have the same precedence and are left-associative. -Within the precedence declaration, tokens are ordered from lowest to highest precedence. Thus, -this declaration specifies that TIMES/DIVIDE have higher -precedence than PLUS/MINUS (since they appear later in the +This declaration specifies that PLUS/MINUS have the +same precedence level and are left-associative and that +TIMES/DIVIDE have the same precedence and are +left-associative. Within the precedence declaration, tokens +are ordered from lowest to highest precedence. Thus, this declaration +specifies that TIMES/DIVIDE have higher precedence +than PLUS/MINUS (since they appear later in the precedence specification).@@ -1735,9 +1856,11 @@5.5 Changing the starting symbol
5.6 Dealing With Ambiguous Grammars
-The expression grammar given in the earlier example has been written in a special format to eliminate ambiguity. -However, in many situations, it is extremely difficult or awkward to write grammars in this format. A -much more natural way to express the grammar is in a more compact form like this: +The expression grammar given in the earlier example has been written +in a special format to eliminate ambiguity. However, in many +situations, it is extremely difficult or awkward to write grammars in +this format. A much more natural way to express the grammar is in a +more compact form like this:-Unfortunately, this grammar specification is ambiguous. For example, if you are parsing the string -"3 * 4 + 5", there is no way to tell how the operators are supposed to be grouped. -For example, does the expression mean "(3 * 4) + 5" or is it "3 * (4+5)"? +Unfortunately, this grammar specification is ambiguous. For example, +if you are parsing the string "3 * 4 + 5", there is no way to tell how +the operators are supposed to be grouped. For example, does the +expression mean "(3 * 4) + 5" or is it "3 * (4+5)"?@@ -1750,15 +1873,18 @@5.6 Dealing With Ambiguous Grammars
-When an ambiguous grammar is given to yacc.py it will print messages about "shift/reduce conflicts" -or a "reduce/reduce conflicts". A shift/reduce conflict is caused when the parser generator can't decide -whether or not to reduce a rule or shift a symbol on the parsing stack. For example, consider -the string "3 * 4 + 5" and the internal parsing stack: +When an ambiguous grammar is given to yacc.py it will print +messages about "shift/reduce conflicts" or "reduce/reduce conflicts". +A shift/reduce conflict is caused when the parser generator can't +decide whether or not to reduce a rule or shift a symbol on the +parsing stack. For example, consider the string "3 * 4 + 5" and the +internal parsing stack:
-In this case, when the parser reaches step 6, it has two options. One is to reduce the -rule expr : expr * expr on the stack. The other option is to shift the -token + on the stack. Both options are perfectly legal from the rules -of the context-free-grammar. +In this case, when the parser reaches step 6, it has two options. One +is to reduce the rule expr : expr * expr on the stack. The +other option is to shift the token + on the stack. Both +options are perfectly legal from the rules of the +context-free-grammar.@@ -1773,20 +1899,25 @@5.6 Dealing With Ambiguous Grammars
-By default, all shift/reduce conflicts are resolved in favor of shifting. Therefore, in the above -example, the parser will always shift the + instead of reducing. Although this -strategy works in many cases (including the ambiguous if-then-else), it is not enough for arithmetic -expressions. In fact, in the above example, the decision to shift + is completely wrong---we should have -reduced expr * expr since multiplication has higher mathematical precedence than addition. +By default, all shift/reduce conflicts are resolved in favor of +shifting. Therefore, in the above example, the parser will always +shift the + instead of reducing. Although this strategy +works in many cases (for example, the case of +"if-then" versus "if-then-else"), it is not enough for arithmetic expressions. In fact, +in the above example, the decision to shift + is completely +wrong---we should have reduced expr * expr since +multiplication has higher mathematical precedence than addition. -
To resolve ambiguity, especially in expression grammars, yacc.py allows individual -tokens to be assigned a precedence level and associativity. This is done by adding a variable +
To resolve ambiguity, especially in expression +grammars, yacc.py allows individual tokens to be assigned a +precedence level and associativity. This is done by adding a variable precedence to the grammar file like this:
@@ -1798,17 +1929,19 @@5.6 Dealing With Ambiguous Grammars
-The precedence specification works by associating a numerical precedence level value and associativity direction to -the listed tokens. For example, in the above example you get: +The precedence specification works by associating a numerical +precedence level value and associativity direction to the listed +tokens. For example, in the above example you get:
-These values are then used to attach a numerical precedence value and associativity direction -to each grammar rule. This is always determined by looking at the precedence of the right-most terminal symbol. -For example: +These values are then used to attach a numerical precedence value and +associativity direction to each grammar rule. This is always +determined by looking at the precedence of the right-most terminal +symbol. For example:@@ -1819,9 +1952,10 @@5.6 Dealing With Ambiguous Grammars
-As an optional feature, yacc.py can automatically track line numbers and positions for all of the grammar symbols -as well. However, this -extra tracking requires extra processing and can significantly slow down parsing. Therefore, it must be enabled by passing the +As an optional feature, yacc.py can automatically track line +numbers and positions for all of the grammar symbols as well. +However, this extra tracking requires extra processing and can +significantly slow down parsing. Therefore, it must be enabled by +passing the tracking=True option to yacc.parse(). For example:@@ -1839,7 +1973,7 @@5.6 Dealing With Ambiguous Grammars
-
-For example, if "expression PLUS expression" has been parsed and the next token -is "TIMES", the action is going to be a shift because "TIMES" has a higher precedence level than "PLUS". On the other -hand, if "expression TIMES expression" has been parsed and the next token is "PLUS", the action -is going to be reduce because "PLUS" has a lower precedence than "TIMES." +For example, if "expression PLUS expression" has been parsed and the +next token is "TIMES", the action is going to be a shift because +"TIMES" has a higher precedence level than "PLUS". On the other hand, +if "expression TIMES expression" has been parsed and the next token is +"PLUS", the action is going to be reduce because "PLUS" has a lower +precedence than "TIMES."- If the current token has higher precedence, it is shifted. +
- If the current token has higher precedence than the rule on the stack, it is shifted.
- If the grammar rule on the stack has higher precedence, the rule is reduced.
- If the current token and the grammar rule have the same precedence, the rule is reduced for left associativity, whereas the token is shifted for right associativity. @@ -1847,21 +1981,28 @@
5.6 Dealing With Ambiguous Grammars
favor of shifting (the default).-When shift/reduce conflicts are resolved using the first three techniques (with the help of -precedence rules), yacc.py will report no errors or conflicts in the grammar. +When shift/reduce conflicts are resolved using the first three +techniques (with the help of precedence rules), yacc.py will +report no errors or conflicts in the grammar (although it will print +some information in the parser.out debugging file).
-One problem with the precedence specifier technique is that it is sometimes necessary to -change the precedence of an operator in certain contents. For example, consider a unary-minus operator -in "3 + 4 * -5". Normally, unary minus has a very high precedence--being evaluated before the multiply. -However, in our precedence specifier, MINUS has a lower precedence than TIMES. To deal with this, -precedence rules can be given for fictitious tokens like this: +One problem with the precedence specifier technique is that it is +sometimes necessary to change the precedence of an operator in certain +contexts. For example, consider a unary-minus operator in "3 + 4 * +-5". Mathematically, the unary minus is normally given a very high +precedence--being evaluated before the multiply. However, in our +precedence specifier, MINUS has a lower precedence than TIMES. To +deal with this, precedence rules can be given for so-called "fictitious tokens" +like this:
-In the file, each state of the grammar is described. Within each state the "." indicates the current -location of the parse within any applicable grammar rules. In addition, the actions for each valid -input token are listed. When a shift/reduce or reduce/reduce conflict arises, rules not selected -are prefixed with an !. For example: +The different states that appear in this file are a representation of +every possible sequence of valid input tokens allowed by the grammar. +When receiving input tokens, the parser is building up a stack and +looking for matching rules. Each state keeps track of the grammar +rules that might be in the process of being matched at that point. Within each +rule, the "." character indicates the current location of the parse +within that rule. In addition, the actions for each valid input token +are listed. When a shift/reduce or reduce/reduce conflict arises, +rules not selected are prefixed with an !. For example:@@ -1950,9 +2091,25 @@5.6 Dealing With Ambiguous Grammars
the rule assignment : ID EQUALS expression.

-It should be noted that reduce/reduce conflicts are notoriously difficult to spot
-simply looking at the input grammer. To locate these, it is usually easier to look at the
-parser.out debugging file with an appropriately high level of caffeination.
+It should be noted that reduce/reduce conflicts are notoriously
+difficult to spot simply by looking at the input grammar.  When a
+reduce/reduce conflict occurs, yacc() will try to help by
+printing a warning message such as this:
+
++ +This message identifies the two rules that are in conflict. However, +it may not tell you how the parser arrived at such a state. To try +and figure it out, you'll probably have to look at your grammar and +the contents of the +parser.out debugging file with an appropriately high level of +caffeination.+WARNING: 1 reduce/reduce conflict +WARNING: reduce/reduce conflict in state 15 resolved using rule (assignment -> ID EQUALS NUMBER) +WARNING: rejected rule (expression -> NUMBER) ++5.7 The parser.out file
@@ -2212,10 +2369,15 @@5.7 The parser.out file
@@ -2438,8 +2609,9 @@@@ -2232,10 +2394,19 @@5.7 The parser.out file
5.8 Syntax Error Handling
+If you are creating a parser for production use, the handling of
+syntax errors is important.  As a general rule, you don't want a
+parser to simply throw up its hands and stop at the first sign of
+trouble.  Instead, you want it to report the error, recover if possible, and
+continue parsing so that all of the errors in the input get reported
+to the user at once.  This is the standard behavior found in compilers
+for languages such as C, C++, and Java.

-When a syntax error occurs during parsing, the error is immediately
+In PLY, when a syntax error occurs during parsing, the error is immediately
detected (i.e., the parser does not read any more tokens beyond the
-source of the error).  Error recovery in LR parsers is a delicate
+source of the error).  However, at this point, the parser enters a
+recovery mode that can be used to try and continue further parsing.
+As a general rule, error recovery in LR parsers is a delicate
topic that involves ancient rituals and black-magic.  The recovery mechanism
provided by yacc.py is comparable to Unix yacc so you may want
to consult a book like O'Reilly's "Lex and Yacc" for some of the finer details.
@@ -2407,7 +2578,7 @@
5.8.3 Signaling an error from a production
def p_production(p): 'production : some production ...' - raise yacc.SyntaxError + raise SyntaxError5.8.4 General comments on error handling
5.9 Line Number and Position Tracking
-Position tracking is often a tricky problem when writing compilers. By default, PLY tracks the line number and position of -all tokens. This information is available using the following functions: +Position tracking is often a tricky problem when writing compilers. +By default, PLY tracks the line number and position of all tokens. +This information is available using the following functions:
@@ -2468,8 +2642,9 @@-Once enabled, the lineno() and lexpos() methods work for all grammar symbols. In addition, two -additional methods can be used: +Once enabled, the lineno() and lexpos() methods work +for all grammar symbols. In addition, two additional methods can be +used:5.9 Line Number and Position Tracking
-Similarly, you may get better parsing performance if you only propagate line number -information where it's needed. For example: +Similarly, you may get better parsing performance if you only +selectively propagate line number information where it's needed using +the p.set_lineno() method. For example:
-Finally, it should be noted that PLY does not store position information after a rule has been -processed. If it is important for you to retain this information in an abstract syntax tree, you -must make your own copy. +PLY doesn't retain line number information from rules that have already been +parsed. If you are building an abstract syntax tree and need to have line numbers, +you should make sure that the line numbers appear in the tree itself.def p_fname(p): 'fname : ID' - p[0] = (p[1],p.lineno(1)) + p[0] = p[1] + p.set_lineno(0,p.lineno(1))
A minimal way to construct a tree is to simply create and +propagate a tuple or list in each grammar rule function. There +are many possible ways to do this, but one example would be something +like this: -For example: +
++ ++def p_expression_binop(p): + '''expression : expression PLUS expression + | expression MINUS expression + | expression TIMES expression + | expression DIVIDE expression''' + + p[0] = ('binary-expression',p[2],p[1],p[3]) + +def p_expression_group(p): + 'expression : LPAREN expression RPAREN' + p[0] = ('group-expression',p[2]) + +def p_expression_number(p): + 'expression : NUMBER' + p[0] = ('number-expression',p[1]) ++
+Another approach is to create a set of data structures for different
+kinds of abstract syntax tree nodes and assign nodes to p[0]
+in each rule. For example:
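+One possible sketch (the class names are only illustrative):
+
+<blockquote>
+<pre>
+class Expr: pass
+
+class BinOp(Expr):
+    def __init__(self,left,op,right):
+        self.left  = left
+        self.op    = op
+        self.right = right
+
+class Number(Expr):
+    def __init__(self,value):
+        self.value = value
+
+def p_expression_binop(p):
+    '''expression : expression PLUS expression
+                  | expression MINUS expression
+                  | expression TIMES expression
+                  | expression DIVIDE expression'''
+    p[0] = BinOp(p[1],p[2],p[3])
+
+def p_expression_number(p):
+    'expression : NUMBER'
+    p[0] = Number(p[1])
+</pre>
+</blockquote>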
-To simplify tree traversal, it may make sense to pick a very generic tree structure for your parse tree nodes. -For example: +The advantage to this approach is that it may make it easier to attach more complicated +semantics, type checking, code generation, and other features to the node classes. + +@@ -2569,8 +2773,12 @@5.10 AST Construction
+To simplify tree traversal, it may make sense to pick a very generic +tree structure for your parse tree nodes. For example:
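+A sketch of one such structure (the attribute names are only
+illustrative):
+
+<blockquote>
+<pre>
+class Node:
+    def __init__(self,type,children=None,leaf=None):
+        self.type = type
+        if children:
+            self.children = children
+        else:
+            self.children = [ ]
+        self.leaf = leaf
+</pre>
+</blockquote>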
-In this case, the embedded action new_scope executes immediately after a LBRACE ({) symbol is parsed. This might -adjust internal symbol tables and other aspects of the parser. Upon completion of the rule statements_block, code might undo the operations performed in the embedded action (e.g., pop_scope()). +In this case, the embedded action new_scope executes +immediately after a LBRACE ({) symbol is parsed. +This might adjust internal symbol tables and other aspects of the +parser. Upon completion of the rule statements_block, code +might undo the operations performed in the embedded action +(e.g., pop_scope()). -@@ -2613,7 +2821,7 @@5.11 Embedded Actions
parsed. Sometimes, however, it is useful to execute small code fragments during intermediate stages of parsing. For example, suppose you wanted to perform some action immediately after A has -been parsed. To do this, you can write a empty rule like this: +been parsed. To do this, write an empty rule like this:-an extra shift-reduce conflict will be introduced. This conflict is caused by the fact that the same symbol C appears next in -both the abcd and abcx rules. The parser can either shift the symbol (abcd rule) or reduce the empty rule seen_AB (abcx rule). +an extra shift-reduce conflict will be introduced. This conflict is +caused by the fact that the same symbol C appears next in +both the abcd and abcx rules. The parser can either +shift the symbol (abcd rule) or reduce the empty +rule seen_AB (abcx rule).@@ -2676,8 +2884,11 @@5.11 Embedded Actions
A common use of embedded rules is to control other aspects of parsing @@ -2701,10 +2912,14 @@
5.11 Embedded Actions
-To do this, it is important to note that both the lexer and parser are
-actually implemented as objects. These objects are returned by the
-lex() and yacc() functions respectively. For example:
+As a general rule, this isn't a problem. However, to make it work,
+you need to carefully make sure everything gets hooked up correctly.
+First, make sure you save the objects returned by lex() and
+yacc(). For example:
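+(A sketch; the variable names lexer and parser are only illustrative.)
+
+<blockquote>
+<pre>
+lexer  = lex.lex()       # Return lexer object
+parser = yacc.yacc()     # Return parser object
+</pre>
+</blockquote>
+
+Then, as described next, pass the saved lexer to the parser's parse()
+method.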
-To attach the lexer and parser together, make sure you use the lexer argumemnt to parse. For example: +Next, when parsing, make sure you give the parse() function a reference to the lexer it +should be using. For example:@@ -2836,7 +3051,8 @@6. Parser and Lexer State Management
-Within lexer and parser rules, these objects are also available. In the lexer, -the "lexer" attribute of a token refers to the lexer object in use. For example: +If you forget to do this, the parser will use the last lexer +created--which is not always what you want. + +@@ -2844,8 +3060,13 @@6. Parser and Lexer State Management
+Within lexer and parser rule functions, these objects are also +available. In the lexer, the "lexer" attribute of a token refers to +the lexer object that triggered the rule. For example:
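+(A sketch; num_count is assumed to be a counter attribute previously
+attached to the lexer.)
+
+<blockquote>
+<pre>
+def t_NUMBER(t):
+    r'\d+'
+    t.lexer.num_count += 1     # t.lexer is the lexer that matched this token
+    t.value = int(t.value)
+    return t
+</pre>
+</blockquote>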
diff --git a/example/BASIC/basiclog.py b/example/BASIC/basiclog.py new file mode 100644 index 0000000..ccfd7b9 --- /dev/null +++ b/example/BASIC/basiclog.py @@ -0,0 +1,79 @@ +# An implementation of Dartmouth BASIC (1964) +# + +import sys +sys.path.insert(0,"../..") + +if sys.version_info[0] >= 3: + raw_input = input + +import logging +logging.basicConfig( + level = logging.INFO, + filename = "parselog.txt", + filemode = "w" +) +log = logging.getLogger() + +import basiclex +import basparse +import basinterp + +# If a filename has been specified, we try to run it. +# If a runtime error occurs, we bail out and enter +# interactive mode below +if len(sys.argv) == 2: + data = open(sys.argv[1]).read() + prog = basparse.parse(data,debug=log) + if not prog: raise SystemExit + b = basinterp.BasicInterpreter(prog) + try: + b.run() + raise SystemExit + except RuntimeError: + pass + +else: + b = basinterp.BasicInterpreter({}) + +# Interactive mode. This incrementally adds/deletes statements +# from the program stored in the BasicInterpreter object. In +# addition, special commands 'NEW','LIST',and 'RUN' are added. +# Specifying a line number with no code deletes that line from +# the program. + +while 1: + try: + line = raw_input("[BASIC] ") + except EOFError: + raise SystemExit + if not line: continue + line += "\n" + prog = basparse.parse(line,debug=log) + if not prog: continue + + keys = list(prog) + if keys[0] > 0: + b.add_statements(prog) + else: + stat = prog[keys[0]] + if stat[0] == 'RUN': + try: + b.run() + except RuntimeError: + pass + elif stat[0] == 'LIST': + b.list() + elif stat[0] == 'BLANK': + b.del_line(stat[1]) + elif stat[0] == 'NEW': + b.new() + + + + + + + + + diff --git a/example/BASIC/basparse.py b/example/BASIC/basparse.py index d773715..ccdeb16 100644 --- a/example/BASIC/basparse.py +++ b/example/BASIC/basparse.py @@ -403,9 +403,9 @@ def p_error(p): bparser = yacc.yacc() -def parse(data): +def parse(data,debug=0): bparser.error = 0 - p = bparser.parse(data) + p = bparser.parse(data,debug=debug) if bparser.error: return None return p diff --git a/example/calc/calc.py b/example/calc/calc.py index 2e36c7d..b923780 100644 --- a/example/calc/calc.py +++ b/example/calc/calc.py @@ -23,11 +23,7 @@ def t_NUMBER(t): r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 + t.value = int(t.value) return t t_ignore = " \t" diff --git a/example/calcdebug/calc.py b/example/calcdebug/calc.py new file mode 100644 index 0000000..6732f9f --- /dev/null +++ b/example/calcdebug/calc.py @@ -0,0 +1,113 @@ +# ----------------------------------------------------------------------------- +# calc.py +# +# This example shows how to run the parser in a debugging mode +# with output routed to a logging object. 
+# ----------------------------------------------------------------------------- + +import sys +sys.path.insert(0,"../..") + +if sys.version_info[0] >= 3: + raw_input = input + +tokens = ( + 'NAME','NUMBER', + ) + +literals = ['=','+','-','*','/', '(',')'] + +# Tokens + +t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*' + +def t_NUMBER(t): + r'\d+' + t.value = int(t.value) + return t + +t_ignore = " \t" + +def t_newline(t): + r'\n+' + t.lexer.lineno += t.value.count("\n") + +def t_error(t): + print("Illegal character '%s'" % t.value[0]) + t.lexer.skip(1) + +# Build the lexer +import ply.lex as lex +lex.lex() + +# Parsing rules + +precedence = ( + ('left','+','-'), + ('left','*','/'), + ('right','UMINUS'), + ) + +# dictionary of names +names = { } + +def p_statement_assign(p): + 'statement : NAME "=" expression' + names[p[1]] = p[3] + +def p_statement_expr(p): + 'statement : expression' + print(p[1]) + +def p_expression_binop(p): + '''expression : expression '+' expression + | expression '-' expression + | expression '*' expression + | expression '/' expression''' + if p[2] == '+' : p[0] = p[1] + p[3] + elif p[2] == '-': p[0] = p[1] - p[3] + elif p[2] == '*': p[0] = p[1] * p[3] + elif p[2] == '/': p[0] = p[1] / p[3] + +def p_expression_uminus(p): + "expression : '-' expression %prec UMINUS" + p[0] = -p[2] + +def p_expression_group(p): + "expression : '(' expression ')'" + p[0] = p[2] + +def p_expression_number(p): + "expression : NUMBER" + p[0] = p[1] + +def p_expression_name(p): + "expression : NAME" + try: + p[0] = names[p[1]] + except LookupError: + print("Undefined name '%s'" % p[1]) + p[0] = 0 + +def p_error(p): + if p: + print("Syntax error at '%s'" % p.value) + else: + print("Syntax error at EOF") + +import ply.yacc as yacc +yacc.yacc() + +import logging +logging.basicConfig( + level=logging.INFO, + filename="parselog.txt" +) + +while 1: + try: + s = raw_input('calc > ') + except EOFError: + break + if not s: continue + yacc.parse(s,debug=logging.getLogger()) diff --git a/example/closurecalc/calc.py b/example/closurecalc/calc.py index a1d5845..6598f58 100644 --- a/example/closurecalc/calc.py +++ b/example/closurecalc/calc.py @@ -36,11 +36,7 @@ def make_calculator(): def t_NUMBER(t): r'\d+' - try: - t.value = int(t.value) - except ValueError: - print("Integer value too large %s" % t.value) - t.value = 0 + t.value = int(t.value) return t def t_newline(t): diff --git a/example/optcalc/README b/example/optcalc/README index 6d196f0..53dd5fc 100644 --- a/example/optcalc/README +++ b/example/optcalc/README @@ -5,5 +5,5 @@ To run: - Then run 'python -OO calc.py' -If working corretly, the second version should run the +If working correctly, the second version should run the same way. diff --git a/ply/lex.py b/ply/lex.py index b9a478d..71b33b5 100644 --- a/ply/lex.py +++ b/ply/lex.py @@ -3,7 +3,7 @@ # # Author: David M. Beazley (dave@dabeaz.com) # -# Copyright (C) 2001-2008, David M. Beazley +# Copyright (C) 2001-2009, David M. Beazley # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -22,12 +22,12 @@ # See the file COPYING for a complete copy of the LGPL. 
# ----------------------------------------------------------------------------- -__version__ = "2.6" -__tabversion__ = "2.4" # Version of table file used +__version__ = "3.0" +__tabversion__ = "3.0" # Version of table file used import re, sys, types, copy, os -# This tuple lists known string types +# This tuple contains known string types try: # Python 2.6 StringTypes = (types.StringType, types.UnicodeType) @@ -35,7 +35,9 @@ # Python 3.0 StringTypes = (str, bytes) -# Compatibility function for python 2.6/3.0 +# Extract the code attribute of a function. Different implementations +# are for Python 2/3 compatibility. + if sys.version_info[0] < 3: def func_code(f): return f.func_code @@ -54,27 +56,12 @@ def __init__(self,message,s): self.args = (message,) self.text = s -# An object used to issue one-time warning messages for various features - -class LexWarning(object): - def __init__(self): - self.warned = 0 - def __call__(self,msg): - if not self.warned: - sys.stderr.write("ply.lex: Warning: " + msg+"\n") - self.warned = 1 - -_SkipWarning = LexWarning() # Warning for use of t.skip() on tokens - # Token class. This class is used to represent the tokens produced. class LexToken(object): def __str__(self): return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos) def __repr__(self): return str(self) - def skip(self,n): - self.lexer.skip(n) - _SkipWarning("Calling t.skip() on a token is deprecated. Please use t.lexer.skip()") # ----------------------------------------------------------------------------- # Lexer class @@ -372,6 +359,19 @@ def token(self): raise RuntimeError("No input string given with input()") return None + # Iterator interface + def __iter__(self): + return self + + def next(self): + t = self.token() + if t is None: + raise StopIteration + return t + + __next__ = next + + # ----------------------------------------------------------------------------- # _validate_file() # @@ -891,7 +891,6 @@ def runmain(lexer=None,data=None): if not tok: break sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos)) - # ----------------------------------------------------------------------------- # @TOKEN(regex) # diff --git a/ply/yacc.py b/ply/yacc.py index 52ac7ef..f660f44 100644 --- a/ply/yacc.py +++ b/ply/yacc.py @@ -1,9 +1,9 @@ -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # ply: yacc.py # # Author(s): David M. Beazley (dave@dabeaz.com) # -# Copyright (C) 2001-2008, David M. Beazley +# Copyright (C) 2001-2009, David M. Beazley # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -50,8 +50,8 @@ # own risk! # ---------------------------------------------------------------------------- -__version__ = "2.6" -__tabversion__ = "2.4" # Table version +__version__ = "3.0" +__tabversion__ = "3.0" # Table version #----------------------------------------------------------------------------- # === User configurable parameters === @@ -73,28 +73,6 @@ import re, types, sys, os.path -# Python 2.6/3.0 compatibility -try: - import cStringIO -except ImportError: - import io as cStringIO - -# Python 2.6/3.0 compatibility function. Create a new MD5 object for computing -# the grammar signature. - -def md5_new(): - try: - import hashlib - return hashlib.md5() - except ImportError: - import md5 - return md5.new() - -# Python 2.6/3.0 compatibility function. 
Update the MD5 signature
-# using UTF-8 encoded data.
-def Signature_update(data):
-    Signature.update(data.encode('utf-8'))
-
 # Compatibility function for python 2.6/3.0
 if sys.version_info[0] < 3:
     def func_code(f):
@@ -114,17 +92,40 @@ def load_ply_lex():
     if sys.version_info[0] < 3:
         import lex
     else:
-        env = { }
-        exec("from . import lex", env, env)
-        lex = env['lex']
+        import ply.lex as lex
     return lex

+# This object is a stand-in for a logging object created by the
+# logging module. PLY will use this by default to create things
+# such as the parser.out file. If a user wants more detailed
+# information, they can create their own logging object and pass
+# it into PLY.
+
+class PlyLogger(object):
+    def __init__(self,f):
+        self.f = f
+    def debug(self,msg,*args,**kwargs):
+        self.f.write((msg % args) + "\n")
+    info = debug
+
+    def warning(self,msg,*args,**kwargs):
+        self.f.write("WARNING: "+ (msg % args) + "\n")
+
+    def error(self,msg,*args,**kwargs):
+        self.f.write("ERROR: " + (msg % args) + "\n")
+
+    critical = debug
+
+# Null logger is used when no output is generated. Does nothing.
+class NullLogger(object):
+    def __getattribute__(self,name):
+        return self
+    def __call__(self,*args,**kwargs):
+        return self
+
 # Exception raised for yacc-related errors
 class YaccError(Exception): pass

-# Exception raised for errors raised in production rules
-class SyntaxError(Exception): pass
-
 #-----------------------------------------------------------------------------
 # === LR Parsing Engine ===
 #
@@ -177,6 +178,9 @@ def __len__(self):
     def lineno(self,n):
         return getattr(self.slice[n],"lineno",0)

+    def set_lineno(self,n,lineno):
+        self.slice[n].lineno = lineno
+
     def linespan(self,n):
         startline = getattr(self.slice[n],"lineno",0)
         endline = getattr(self.slice[n],"endlineno",startline)
@@ -194,27 +198,18 @@ def error(self):
         raise SyntaxError

-# The LR Parsing engine. This is defined as a class so that multiple parsers
-# can exist in the same process. A user never instantiates this directly.
-# Instead, the global yacc() function should be used to create a suitable Parser
-# object.
-
-class Parser:
-    def __init__(self,magic=None):
-
-        # This is a hack to keep users from trying to instantiate a Parser
-        # object directly.
-
-        if magic != "xyzzy":
-            raise YaccError("Can't directly instantiate Parser. Use yacc() instead.")
+# -----------------------------------------------------------------------------
+# == LRParser ==
+#
+# The LR Parsing engine.
+# ----------------------------------------------------------------------------- - # Reset internal state - self.productions = None # List of productions - self.errorfunc = None # Error handling function - self.action = { } # LR Action table - self.goto = { } # LR goto table - self.require = { } # Attribute require table - self.method = "Unknown LR" # Table construction method used +class LRParser: + def __init__(self,lrtab,errorf): + self.productions = lrtab.lr_productions + self.action = lrtab.lr_action + self.goto = lrtab.lr_goto + self.errorfunc = errorf def errok(self): self.errorok = 1 @@ -229,6 +224,8 @@ def restart(self): def parse(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): if debug or yaccdevel: + if isinstance(debug,int): + debug = PlyLogger(sys.stderr) return self.parsedebug(input,lexer,debug,tracking,tokenfunc) elif tracking: return self.parseopt(input,lexer,debug,tracking,tokenfunc) @@ -250,7 +247,7 @@ def parse(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): # # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - def parsedebug(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): + def parsedebug(self,input=None,lexer=None,debug=None,tracking=0,tokenfunc=None): lookahead = None # Current lookahead symbol lookaheadstack = [ ] # Stack of lookahead symbols actions = self.action # Local reference to action table (to avoid lookup on self.) @@ -259,6 +256,10 @@ def parsedebug(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): pslice = YaccProduction(None) # Production object passed to grammar rules errorcount = 0 # Used during error recovery + # --! DEBUG + debug.info("PLY: PARSE DEBUG START") + # --! DEBUG + # If no lexer was given, we will try to use the lex module if not lexer: lex = load_ply_lex() @@ -301,8 +302,8 @@ def parsedebug(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): # the next token off of the lookaheadstack or from the lexer # --! DEBUG - if debug > 1: - sys.stdout.write('state %s\n' % state) + debug.debug('') + debug.debug('State : %s', state) # --! DEBUG if not lookahead: @@ -315,32 +316,22 @@ def parsedebug(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): lookahead.type = "$end" # --! DEBUG - if debug: - errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip() + debug.debug('Stack : %s', + ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) # --! DEBUG # Check the action table ltype = lookahead.type t = actions[state].get(ltype) - # --! DEBUG - if debug > 1: - sys.stdout.write('action %s\n' % t) - # --! DEBUG - if t is not None: if t > 0: # shift a symbol on the stack - if ltype == "$end": - # Error, end of input - sys.stderr.write("yacc: Parse error. EOF\n") - return statestack.append(t) state = t # --! DEBUG - if debug > 1: - sys.stderr.write("%-60s shift state %s\n" % (errorlead, t)) + debug.debug("Action : Shift and goto state %s", t) # --! DEBUG symstack.append(lookahead) @@ -362,8 +353,11 @@ def parsedebug(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): sym.value = None # --! DEBUG - if debug > 1: - sys.stderr.write("%-60s reduce %d\n" % (errorlead, -t)) + if plen: + debug.info("Action : Reduce rule [%s] with %s and goto state %d", p.str, [_v.value for _v in symstack[-plen:]],-t) + else: + debug.info("Action : Reduce rule [%s] with %s and goto state %d", p.str, [],-t) + # --! 
DEBUG if plen: @@ -392,7 +386,10 @@ def parsedebug(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): # Call the grammar rule with our special slice object del symstack[-plen:] del statestack[-plen:] - p.func(pslice) + p.callable(pslice) + # --! DEBUG + debug.info("Result : %r", pslice[0]) + # --! DEBUG symstack.append(sym) state = goto[statestack[-1]][pname] statestack.append(state) @@ -428,7 +425,10 @@ def parsedebug(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): try: # Call the grammar rule with our special slice object - p.func(pslice) + p.callable(pslice) + # --! DEBUG + debug.info("Result : %r", pslice[0]) + # --! DEBUG symstack.append(sym) state = goto[statestack[-1]][pname] statestack.append(state) @@ -447,13 +447,18 @@ def parsedebug(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): if t == 0: n = symstack[-1] - return getattr(n,"value",None) + result = getattr(n,"value",None) + # --! DEBUG + debug.info("Done : Returning %r", result) + debug.info("PLY: PARSE DEBUG END") + # --! DEBUG + return result if t == None: # --! DEBUG - if debug: - sys.stderr.write(errorlead + "\n") + debug.error('Error : %s', + ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip()) # --! DEBUG # We have some kind of parsing error here. To handle @@ -621,10 +626,6 @@ def parseopt(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): if t is not None: if t > 0: # shift a symbol on the stack - if ltype == '$end': - # Error, end of input - sys.stderr.write("yacc: Parse error. EOF\n") - return statestack.append(t) state = t @@ -672,7 +673,7 @@ def parseopt(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): # Call the grammar rule with our special slice object del symstack[-plen:] del statestack[-plen:] - p.func(pslice) + p.callable(pslice) symstack.append(sym) state = goto[statestack[-1]][pname] statestack.append(state) @@ -708,7 +709,7 @@ def parseopt(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=None): try: # Call the grammar rule with our special slice object - p.func(pslice) + p.callable(pslice) symstack.append(sym) state = goto[statestack[-1]][pname] statestack.append(state) @@ -895,10 +896,6 @@ def parseopt_notrack(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=Non if t is not None: if t > 0: # shift a symbol on the stack - if ltype == '$end': - # Error, end of input - sys.stderr.write("yacc: Parse error. EOF\n") - return statestack.append(t) state = t @@ -935,7 +932,7 @@ def parseopt_notrack(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=Non # Call the grammar rule with our special slice object del symstack[-plen:] del statestack[-plen:] - p.func(pslice) + p.callable(pslice) symstack.append(sym) state = goto[statestack[-1]][pname] statestack.append(state) @@ -965,7 +962,7 @@ def parseopt_notrack(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=Non try: # Call the grammar rule with our special slice object - p.func(pslice) + p.callable(pslice) symstack.append(sym) state = goto[statestack[-1]][pname] statestack.append(state) @@ -1078,167 +1075,169 @@ def parseopt_notrack(self,input=None,lexer=None,debug=0,tracking=0,tokenfunc=Non # Call an error function here raise RuntimeError("yacc: internal parser error!!!\n") - -# ----------------------------------------------------------------------------- -# === Parser Construction === -# -# The following functions and variables are used to implement the yacc() function -# itself. 
This is pretty hairy stuff involving lots of error checking, -# construction of LR items, kernels, and so forth. Although a lot of -# this work is done using global variables, the resulting Parser object -# is completely self contained--meaning that it is safe to repeatedly -# call yacc() with different grammars in the same application. -# ----------------------------------------------------------------------------- - # ----------------------------------------------------------------------------- -# validate_file() +# === Grammar Representation === # -# This function checks to see if there are duplicated p_rulename() functions -# in the parser module file. Without this function, it is really easy for -# users to make mistakes by cutting and pasting code fragments (and it's a real -# bugger to try and figure out why the resulting parser doesn't work). Therefore, -# we just do a little regular expression pattern matching of def statements -# to try and detect duplicates. +# The following functions, classes, and variables are used to represent and +# manipulate the rules that make up a grammar. # ----------------------------------------------------------------------------- -def validate_file(filename): - base,ext = os.path.splitext(filename) - if ext != '.py': return 1 # No idea. Assume it's okay. +import re - try: - f = open(filename) - lines = f.readlines() - f.close() - except IOError: - return 1 # Oh well - - # Match def p_funcname( - fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') - counthash = { } - linen = 1 - noerror = 1 - for l in lines: - m = fre.match(l) - if m: - name = m.group(1) - prev = counthash.get(name) - if not prev: - counthash[name] = linen - else: - sys.stderr.write("%s:%d: Function %s redefined. Previously defined on line %d\n" % (filename,linen,name,prev)) - noerror = 0 - linen += 1 - return noerror - -# This function looks for functions that might be grammar rules, but which don't have the proper p_suffix. -def validate_dict(d): - for n,v in d.items(): - if n[0:2] == 'p_' and type(v) in (types.FunctionType, types.MethodType): continue - if n[0:2] == 't_': continue - - if n[0:2] == 'p_': - sys.stderr.write("yacc: Warning. '%s' not defined as a function\n" % n) - if 1 and isinstance(v,types.FunctionType) and func_code(v).co_argcount == 1: - try: - doc = v.__doc__.split(" ") - if doc[1] == ':': - sys.stderr.write("%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix.\n" % (func_code(v).co_filename, func_code(v).co_firstlineno,n)) - except Exception: - pass +# regex matching identifiers +_is_identifier = re.compile(r'^[a-zA-Z0-9_-]+$') # ----------------------------------------------------------------------------- -# === GRAMMAR FUNCTIONS === +# class Production: +# +# This class stores the raw information about a single production or grammar rule. +# A grammar rule refers to a specification such as this: +# +# expr : expr PLUS term +# +# Here are the basic attributes defined on all productions +# +# name - Name of the production. For example 'expr' +# prod - A list of symbols on the right side ['expr','PLUS','term'] +# prec - Production precedence level +# number - Production number. +# func - Function that executes on reduce +# file - File where production function is defined +# lineno - Line number where production function is defined # -# The following global variables and functions are used to store, manipulate, -# and verify the grammar rules specified by the user. +# The following attributes are defined or optional. 
+# +# len - Length of the production (number of symbols on right hand side) +# usyms - Set of unique symbols found in the production # ----------------------------------------------------------------------------- -# Initialize all of the global variables used during grammar construction -def initialize_vars(): - global Productions, Prodnames, Prodmap, Terminals - global Nonterminals, First, Follow, Precedence, UsedPrecedence, LRitems - global Errorfunc, Signature, Requires - - Productions = [None] # A list of all of the productions. The first - # entry is always reserved for the purpose of - # building an augmented grammar - - Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all - # productions of that nonterminal. +class Production(object): + def __init__(self,number,name,prod,precedence=('right',0),func=None,file='',line=0): + self.name = name + self.prod = tuple(prod) + self.number = number + self.func = func + self.callable = None + self.file = file + self.line = line + self.prec = precedence + + # Internal settings used during table construction + + self.len = len(self.prod) # Length of the production - Prodmap = { } # A dictionary that is only used to detect duplicate - # productions. + # Create a list of unique production symbols used in the production + self.usyms = [ ] + for s in self.prod: + if s not in self.usyms: + self.usyms.append(s) - Terminals = { } # A dictionary mapping the names of terminal symbols to a - # list of the rules where they are used. + # List of all LR items for the production + self.lr_items = [] + self.lr_next = None - Nonterminals = { } # A dictionary mapping names of nonterminals to a list - # of rule numbers where they are used. + # Create a string representation + if self.prod: + self.str = "%s -> %s" % (self.name," ".join(self.prod)) + else: + self.str = "%s ->" % self.name - First = { } # A dictionary of precomputed FIRST(x) symbols + def __str__(self): + return self.str - Follow = { } # A dictionary of precomputed FOLLOW(x) symbols + def __repr__(self): + return "Production("+str(self)+")" - Precedence = { } # Precedence rules for each terminal. Contains tuples of the - # form ('right',level) or ('nonassoc', level) or ('left',level) + def __len__(self): + return len(self.prod) - UsedPrecedence = { } # Precedence rules that were actually used by the grammer. - # This is only used to provide error checking and to generate - # a warning about unused precedence rules. + def __nonzero__(self): + return 1 - LRitems = [ ] # A list of all LR items for the grammar. These are the - # productions with the "dot" like E -> E . PLUS E + def __getitem__(self,index): + return self.prod[index] + + # Return the nth lr_item from the production (or None if at the end) + def lr_item(self,n): + if n > len(self.prod): return None + p = LRItem(self,n) - Errorfunc = None # User defined error handler + # Precompute the list of productions immediately following. Hack. Remove later + try: + p.lr_after = Prodnames[p.prod[n+1]] + except (IndexError,KeyError): + p.lr_after = [] + try: + p.lr_before = p.prod[n-1] + except IndexError: + p.lr_before = None - Signature = md5_new() # Digital signature of the grammar rules, precedence - # and other information. Used to determined when a - # parsing table needs to be regenerated. 
+        return p

-    Signature_update(__tabversion__)

+    # Bind the production function name to a callable
+    def bind(self,pdict):
+        if self.func:
+            self.callable = pdict[self.func]
+
+# This class serves as a minimal stand-in for Production objects when
+# reading table data from files. It only contains information
+# actually used by the LR parsing engine, plus some additional
+# debugging information.
+class MiniProduction(object):
+    def __init__(self,str,name,len,func,file,line):
+        self.name = name
+        self.len = len
+        self.func = func
+        self.callable = None
+        self.file = file
+        self.line = line
+        self.str = str
+    def __str__(self):
+        return self.str
+    def __repr__(self):
+        return "MiniProduction(%s)" % self.str

-    Requires = { }     # Requires list

+    # Bind the production function name to a callable
+    def bind(self,pdict):
+        if self.func:
+            self.callable = pdict[self.func]

-    # File objects used when creating the parser.out debugging file
-    global _vf, _vfc
-    _vf = cStringIO.StringIO()
-    _vfc = cStringIO.StringIO()

 # -----------------------------------------------------------------------------
-# class Production:
+# class LRItem
 #
-# This class stores the raw information about a single production or grammar rule.
-# It has a few required attributes:
+# This class represents a specific stage of parsing a production rule. For
+# example:
 #
-#       name     - Name of the production (nonterminal)
-#       prod     - A list of symbols making up its production
-#       number   - Production number.
+#       expr : expr . PLUS term
 #
-# In addition, a few additional attributes are used to help with debugging or
-# optimization of table generation.
+# In the above, the "." represents the current location of the parse. Here are
+# the basic attributes:
 #
-#       file     - File where production action is defined.
-#       lineno   - Line number where action is defined
-#       func     - Action function
-#       prec     - Precedence level
-#       lr_next  - Next LR item. Example, if we are ' E -> E . PLUS E'
-#                  then lr_next refers to 'E -> E PLUS . E'
-#       lr_index - LR item index (location of the ".") in the prod list.
# lookaheads - LALR lookahead symbols for this item -# len - Length of the production (number of symbols on right hand side) +# len - Length of the production (number of symbols on right hand side) +# lr_after - List of all productions that immediately follow +# lr_before - Grammar symbol immediately before # ----------------------------------------------------------------------------- -class Production: - def __init__(self,**kw): - for k,v in kw.items(): - setattr(self,k,v) - self.lr_index = -1 - self.lr0_added = 0 # Flag indicating whether or not added to LR0 closure - self.lr1_added = 0 # Flag indicating whether or not added to LR1 - self.usyms = [ ] +class LRItem(object): + def __init__(self,p,n): + self.name = p.name + self.prod = list(p.prod) + self.number = p.number + self.lr_index = n self.lookaheads = { } - self.lk_added = { } - self.setnumbers = [ ] + self.prod.insert(n,".") + self.prod = tuple(self.prod) + self.len = len(self.prod) + self.usyms = p.usyms def __str__(self): if self.prod: @@ -1248,933 +1247,580 @@ def __str__(self): return s def __repr__(self): - return str(self) - - # Compute lr_items from the production - def lr_item(self,n): - if n > len(self.prod): return None - p = Production() - p.name = self.name - p.prod = list(self.prod) - p.number = self.number - p.lr_index = n - p.lookaheads = { } - p.setnumbers = self.setnumbers - p.prod.insert(n,".") - p.prod = tuple(p.prod) - p.len = len(p.prod) - p.usyms = self.usyms - - # Precompute list of productions immediately following - try: - p.lrafter = Prodnames[p.prod[n+1]] - except (IndexError,KeyError): - p.lrafter = [] - try: - p.lrbefore = p.prod[n-1] - except IndexError: - p.lrbefore = None + return "LRItem("+str(self)+")" - return p + def __len__(self): + return len(self.prod) -class MiniProduction: - pass - -# regex matching identifiers -_is_identifier = re.compile(r'^[a-zA-Z0-9_-]+$') + def __getitem__(self,index): + return self.prod[index] # ----------------------------------------------------------------------------- -# add_production() +# rightmost_terminal() # -# Given an action function, this function assembles a production rule. -# The production rule is assumed to be found in the function's docstring. -# This rule has the general syntax: +# Return the rightmost terminal from a list of symbols. Used in add_production() +# ----------------------------------------------------------------------------- +def rightmost_terminal(symbols, terminals): + i = len(symbols) - 1 + while i >= 0: + if symbols[i] in terminals: + return symbols[i] + i -= 1 + return None + +# ----------------------------------------------------------------------------- +# === GRAMMAR CLASS === # -# name1 ::= production1 -# | production2 -# | production3 -# ... -# | productionn -# name2 ::= production1 -# | production2 -# ... +# The following class represents the contents of the specified grammar along +# with various computed properties such as first sets, follow sets, LR items, etc. +# This data is used for critical parts of the table generation process later. # ----------------------------------------------------------------------------- -def add_production(f,file,line,prodname,syms): - - if prodname in Terminals: - sys.stderr.write("%s:%d: Illegal rule name '%s'. Already defined as a token.\n" % (file,line,prodname)) - return -1 - if prodname == 'error': - sys.stderr.write("%s:%d: Illegal rule name '%s'. 
error is a reserved word.\n" % (file,line,prodname)) - return -1 - - if not _is_identifier.match(prodname): - sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname)) - return -1 - - for x in range(len(syms)): - s = syms[x] - if s[0] in "'\"": - try: - c = eval(s) - if (len(c) > 1): - sys.stderr.write("%s:%d: Literal token %s in rule '%s' may only be a single character\n" % (file,line,s, prodname)) - return -1 - if not c in Terminals: - Terminals[c] = [] - syms[x] = c - continue - except SyntaxError: - pass - if not _is_identifier.match(s) and s != '%prec': - sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname)) - return -1 - - # See if the rule is already in the rulemap - map = "%s -> %s" % (prodname,syms) - if map in Prodmap: - m = Prodmap[map] - sys.stderr.write("%s:%d: Duplicate rule %s.\n" % (file,line, m)) - sys.stderr.write("%s:%d: Previous definition at %s:%d\n" % (file,line, m.file, m.line)) - return -1 - - p = Production() - p.name = prodname - p.prod = syms - p.file = file - p.line = line - p.func = f - p.number = len(Productions) - - - Productions.append(p) - Prodmap[map] = p - if not prodname in Nonterminals: - Nonterminals[prodname] = [ ] - - # Add all terminals to Terminals - i = 0 - while i < len(p.prod): - t = p.prod[i] - if t == '%prec': - try: - precname = p.prod[i+1] - except IndexError: - sys.stderr.write("%s:%d: Syntax error. Nothing follows %%prec.\n" % (p.file,p.line)) - return -1 - - prec = Precedence.get(precname,None) - if not prec: - sys.stderr.write("%s:%d: Nothing known about the precedence of '%s'\n" % (p.file,p.line,precname)) - return -1 - else: - p.prec = prec - UsedPrecedence[precname] = 1 - del p.prod[i] - del p.prod[i] - continue - - if t in Terminals: - Terminals[t].append(p.number) - # Is a terminal. We'll assign a precedence to p based on this - if not hasattr(p,"prec"): - p.prec = Precedence.get(t,('right',0)) - else: - if not t in Nonterminals: - Nonterminals[t] = [ ] - Nonterminals[t].append(p.number) - i += 1 +class GrammarError(YaccError): pass - if not hasattr(p,"prec"): - p.prec = ('right',0) +class Grammar(object): + def __init__(self,terminals): + self.Productions = [None] # A list of all of the productions. The first + # entry is always reserved for the purpose of + # building an augmented grammar - # Set final length of productions - p.len = len(p.prod) - p.prod = tuple(p.prod) + self.Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all + # productions of that nonterminal. - # Calculate unique syms in the production - p.usyms = [ ] - for s in p.prod: - if s not in p.usyms: - p.usyms.append(s) + self.Prodmap = { } # A dictionary that is only used to detect duplicate + # productions. - # Add to the global productions list - try: - Prodnames[p.name].append(p) - except KeyError: - Prodnames[p.name] = [ p ] - return 0 + self.Terminals = { } # A dictionary mapping the names of terminal symbols to a + # list of the rules where they are used. 
-# Given a raw rule function, this function rips out its doc string
-# and adds rules to the grammar
+        for term in terminals:
+            self.Terminals[term] = []

-def add_function(f):
-    line = func_code(f).co_firstlineno
-    file = func_code(f).co_filename
-    error = 0
+        self.Terminals['error'] = []

-    if isinstance(f,types.MethodType):
-        reqdargs = 2
-    else:
-        reqdargs = 1
-
-    if func_code(f).co_argcount > reqdargs:
-        sys.stderr.write("%s:%d: Rule '%s' has too many arguments.\n" % (file,line,f.__name__))
-        return -1
-
-    if func_code(f).co_argcount < reqdargs:
-        sys.stderr.write("%s:%d: Rule '%s' requires an argument.\n" % (file,line,f.__name__))
-        return -1
-
-    if f.__doc__:
-        # Split the doc string into lines
-        pstrings = f.__doc__.splitlines()
-        lastp = None
-        dline = line
-        for ps in pstrings:
-            dline += 1
-            p = ps.split()
-            if not p: continue
-            try:
-                if p[0] == '|':
-                    # This is a continuation of a previous rule
-                    if not lastp:
-                        sys.stderr.write("%s:%d: Misplaced '|'.\n" % (file,dline))
-                        return -1
-                    prodname = lastp
-                    if len(p) > 1:
-                        syms = p[1:]
-                    else:
-                        syms = [ ]
-                else:
-                    prodname = p[0]
-                    lastp = prodname
-                    assign = p[1]
-                    if len(p) > 2:
-                        syms = p[2:]
-                    else:
-                        syms = [ ]
-                    if assign != ':' and assign != '::=':
-                        sys.stderr.write("%s:%d: Syntax error. Expected ':'\n" % (file,dline))
-                        return -1
+        self.Nonterminals = { }    # A dictionary mapping names of nonterminals to a list
+                                   # of rule numbers where they are used.

+        self.First = { }           # A dictionary of precomputed FIRST(x) symbols

-                e = add_production(f,file,dline,prodname,syms)
-                error += e
+        self.Follow = { }          # A dictionary of precomputed FOLLOW(x) symbols

+        self.Precedence = { }      # Precedence rules for each terminal. Contains tuples of the
+                                   # form ('right',level) or ('nonassoc', level) or ('left',level)

-            except Exception:
-                sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps))
-                error -= 1
-    else:
-        sys.stderr.write("%s:%d: No documentation string specified in function '%s'\n" % (file,line,f.__name__))
-    return error
-
-
-# Cycle checking code (Michael Dyck)
-
-def compute_reachable():
-    '''
-    Find each symbol that can be reached from the start symbol.
-    Print a warning for any nonterminals that can't be reached.
-    (Unused terminals have already had their warning.)
-    '''
-    Reachable = { }
-    for s in list(Terminals) + list(Nonterminals):
-        Reachable[s] = 0
-
-    mark_reachable_from( Productions[0].prod[0], Reachable )
-
-    for s in list(Nonterminals):
-        if not Reachable[s]:
-            sys.stderr.write("yacc: Symbol '%s' is unreachable.\n" % s)
-
-def mark_reachable_from(s, Reachable):
-    '''
-    Mark all symbols that are reachable from symbol s.
-    '''
-    if Reachable[s]:
-        # We've already reached symbol s.
-        return
-    Reachable[s] = 1
-    for p in Prodnames.get(s,[]):
-        for r in p.prod:
-            mark_reachable_from(r, Reachable)
+        self.UsedPrecedence = { }  # Precedence rules that were actually used by the grammar.
+                                   # This is only used to provide error checking and to generate
+                                   # a warning about unused precedence rules.

-# -----------------------------------------------------------------------------
-# compute_terminates()
-#
-# This function looks at the various parsing rules and tries to detect
-# infinite recursion cycles (grammar rules where there is no possible way
-# to derive a string of only terminals).
-# -----------------------------------------------------------------------------
-def compute_terminates():
-    '''
-    Raise an error for any symbols that don't terminate.
- ''' - Terminates = {} - - # Terminals: - for t in Terminals: - Terminates[t] = 1 - - Terminates['$end'] = 1 - - # Nonterminals: - - # Initialize to false: - for n in Nonterminals: - Terminates[n] = 0 - - # Then propagate termination until no change: - while 1: - some_change = 0 - for (n,pl) in Prodnames.items(): - # Nonterminal n terminates iff any of its productions terminates. - for p in pl: - # Production p terminates iff all of its rhs symbols terminate. - for s in p.prod: - if not Terminates[s]: - # The symbol s does not terminate, - # so production p does not terminate. - p_terminates = 0 - break - else: - # didn't break from the loop, - # so every symbol s terminates - # so production p terminates. - p_terminates = 1 - - if p_terminates: - # symbol n terminates! - if not Terminates[n]: - Terminates[n] = 1 - some_change = 1 - # Don't need to consider any more productions for this n. - break - - if not some_change: - break - - some_error = 0 - for (s,terminates) in Terminates.items(): - if not terminates: - if not s in Prodnames and not s in Terminals and s != 'error': - # s is used-but-not-defined, and we've already warned of that, - # so it would be overkill to say that it's also non-terminating. - pass - else: - sys.stderr.write("yacc: Infinite recursion detected for symbol '%s'.\n" % s) - some_error = 1 + self.Start = None # Starting symbol for the grammar - return some_error -# ----------------------------------------------------------------------------- -# verify_productions() -# -# This function examines all of the supplied rules to see if they seem valid. -# ----------------------------------------------------------------------------- -def verify_productions(cycle_check=1): - error = 0 - for p in Productions: - if not p: continue + def __len__(self): + return len(self.Productions) - for s in p.prod: - if not s in Prodnames and not s in Terminals and s != 'error': - sys.stderr.write("%s:%d: Symbol '%s' used, but not defined as a token or a rule.\n" % (p.file,p.line,s)) - error = 1 - continue + def __getitem__(self,index): + return self.Productions[index] - unused_tok = 0 - # Now verify all of the tokens - if yaccdebug: - _vf.write("Unused terminals:\n\n") - for s,v in Terminals.items(): - if s != 'error' and not v: - sys.stderr.write("yacc: Warning. Token '%s' defined, but not used.\n" % s) - if yaccdebug: _vf.write(" %s\n"% s) - unused_tok += 1 - - # Print out all of the productions - if yaccdebug: - _vf.write("\nGrammar\n\n") - for i in range(1,len(Productions)): - _vf.write("Rule %-5d %s\n" % (i, Productions[i])) - - unused_prod = 0 - # Verify the use of all productions - for s,v in Nonterminals.items(): - if not v: - p = Prodnames[s][0] - sys.stderr.write("%s:%d: Warning. Rule '%s' defined, but not used.\n" % (p.file,p.line, s)) - unused_prod += 1 - - - if unused_tok == 1: - sys.stderr.write("yacc: Warning. There is 1 unused token.\n") - if unused_tok > 1: - sys.stderr.write("yacc: Warning. There are %d unused tokens.\n" % unused_tok) - - if unused_prod == 1: - sys.stderr.write("yacc: Warning. There is 1 unused rule.\n") - if unused_prod > 1: - sys.stderr.write("yacc: Warning. 
There are %d unused rules.\n" % unused_prod) - - if yaccdebug: - _vf.write("\nTerminals, with rules where they appear\n\n") - ks = list(Terminals) - ks.sort() - for k in ks: - _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Terminals[k]]))) - _vf.write("\nNonterminals, with rules where they appear\n\n") - ks = list(Nonterminals) - ks.sort() - for k in ks: - _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Nonterminals[k]]))) - - if (cycle_check): - compute_reachable() - error += compute_terminates() -# error += check_cycles() - return error + # ----------------------------------------------------------------------------- + # set_precedence() + # + # Sets the precedence for a given terminal. assoc is the associativity such as + # 'left','right', or 'nonassoc'. level is a numeric level. + # + # ----------------------------------------------------------------------------- + + def set_precedence(self,term,assoc,level): + assert self.Productions == [None],"Must call set_precedence() before add_production()" + if term in self.Precedence: + raise GrammarError("Precedence already specified for terminal '%s'" % term) + if assoc not in ['left','right','nonassoc']: + raise GrammarError("Associativity must be one of 'left','right', or 'nonassoc'") + self.Precedence[term] = (assoc,level) + + # ----------------------------------------------------------------------------- + # add_production() + # + # Given an action function, this function assembles a production rule and + # computes its precedence level. + # + # The production rule is supplied as a list of symbols. For example, + # a rule such as 'expr : expr PLUS term' has a production name of 'expr' and + # symbols ['expr','PLUS','term']. + # + # Precedence is determined by the precedence of the right-most non-terminal + # or the precedence of a terminal specified by %prec. + # + # A variety of error checks are performed to make sure production symbols + # are valid and that %prec is used correctly. + # ----------------------------------------------------------------------------- + + def add_production(self,prodname,syms,func=None,file='',line=0): + + if prodname in self.Terminals: + raise GrammarError("%s:%d: Illegal rule name '%s'. Already defined as a token" % (file,line,prodname)) + if prodname == 'error': + raise GrammarError("%s:%d: Illegal rule name '%s'. error is a reserved word" % (file,line,prodname)) + if not _is_identifier.match(prodname): + raise GrammarError("%s:%d: Illegal rule name '%s'" % (file,line,prodname)) + + # Look for literal tokens + for n,s in enumerate(syms): + if s[0] in "'\"": + try: + c = eval(s) + if (len(c) > 1): + raise GrammarError("%s:%d: Literal token %s in rule '%s' may only be a single character" % (file,line,s, prodname)) + if not c in self.Terminals: + self.Terminals[c] = [] + syms[n] = c + continue + except SyntaxError: + pass + if not _is_identifier.match(s) and s != '%prec': + raise GrammarError("%s:%d: Illegal name '%s' in rule '%s'" % (file,line,s, prodname)) + + # Determine the precedence level + if '%prec' in syms: + if syms[-1] == '%prec': + raise GrammarError("%s:%d: Syntax error. Nothing follows %%prec" % (file,line)) + if syms[-2] != '%prec': + raise GrammarError("%s:%d: Syntax error. 
%%prec can only appear at the end of a grammar rule" % (file,line))
+            precname = syms[-1]
+            prodprec = self.Precedence.get(precname,None)
+            if not prodprec:
+                raise GrammarError("%s:%d: Nothing known about the precedence of '%s'" % (file,line,precname))
+            else:
+                self.UsedPrecedence[precname] = 1
+            del syms[-2:]     # Drop %prec from the rule
+        else:
+            # If no %prec, precedence is determined by the rightmost terminal symbol
+            precname = rightmost_terminal(syms,self.Terminals)
+            prodprec = self.Precedence.get(precname,('right',0))
+
+        # See if the rule is already in the rulemap
+        map = "%s -> %s" % (prodname,syms)
+        if map in self.Prodmap:
+            m = self.Prodmap[map]
+            raise GrammarError("%s:%d: Duplicate rule %s. " % (file,line, m) +
+                               "Previous definition at %s:%d" % (m.file, m.line))
+
+        # From this point on, everything is valid. Create a new Production instance
+        pnumber = len(self.Productions)
+        if not prodname in self.Nonterminals:
+            self.Nonterminals[prodname] = [ ]
+
+        # Add the production number to Terminals and Nonterminals
+        for t in syms:
+            if t in self.Terminals:
+                self.Terminals[t].append(pnumber)
+            else:
+                if not t in self.Nonterminals:
+                    self.Nonterminals[t] = [ ]
+                self.Nonterminals[t].append(pnumber)

-# -----------------------------------------------------------------------------
-# build_lritems()
-#
-# This function walks the list of productions and builds a complete set of the
-# LR items. The LR items are stored in two ways: First, they are uniquely
-# numbered and placed in the list _lritems. Second, a linked list of LR items
-# is built for each production. For example:
-#
-#   E -> E PLUS E
-#
-# Creates the list
-#
-#  [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ]
-# -----------------------------------------------------------------------------
+        # Create a production and add it to the list of productions
+        p = Production(pnumber,prodname,syms,prodprec,func,file,line)
+        self.Productions.append(p)
+        self.Prodmap[map] = p

-def build_lritems():
-    for p in Productions:
-        lastlri = p
-        lri = p.lr_item(0)
-        i = 0
-        while 1:
-            lri = p.lr_item(i)
-            lastlri.lr_next = lri
-            if not lri: break
-            lri.lr_num = len(LRitems)
-            LRitems.append(lri)
-            lastlri = lri
-            i += 1
+        # Add to the global productions list
+        try:
+            self.Prodnames[prodname].append(p)
+        except KeyError:
+            self.Prodnames[prodname] = [ p ]
+        return 0

-    # In order for the rest of the parser generator to work, we need to
-    # guarantee that no more lritems are generated. Therefore, we nuke
-    # the p.lr_item method. (Only used in debugging)
-    # Production.lr_item = None
+    # -----------------------------------------------------------------------------
+    # set_start()
+    #
+    # Sets the starting symbol and creates the augmented grammar. Production
+    # rule 0 is S' -> start where start is the start symbol.
+    # -----------------------------------------------------------------------------
+
+    def set_start(self,start=None):
+        if not start:
+            start = self.Productions[1].name
+        if start not in self.Nonterminals:
+            raise GrammarError("start symbol %s undefined" % start)
+        self.Productions[0] = Production(0,"S'",[start])
+        self.Nonterminals[start].append(0)
+        self.Start = start
+
+    # -----------------------------------------------------------------------------
+    # find_unreachable()
+    #
+    # Find all of the nonterminal symbols that can't be reached from the starting
+    # symbol. Returns a list of nonterminals that can't be reached.
+ # ----------------------------------------------------------------------------- -# ----------------------------------------------------------------------------- -# add_precedence() -# -# Given a list of precedence rules, add to the precedence table. -# ----------------------------------------------------------------------------- + def find_unreachable(self): + + # Mark all symbols that are reachable from a symbol s + def mark_reachable_from(s): + if reachable[s]: + # We've already reached symbol s. + return + reachable[s] = 1 + for p in self.Prodnames.get(s,[]): + for r in p.prod: + mark_reachable_from(r) + + reachable = { } + for s in list(self.Terminals) + list(self.Nonterminals): + reachable[s] = 0 + + mark_reachable_from( self.Productions[0].prod[0] ) + + return [s for s in list(self.Nonterminals) + if not reachable[s]] + + # ----------------------------------------------------------------------------- + # infinite_cycles() + # + # This function looks at the various parsing rules and tries to detect + # infinite recursion cycles (grammar rules where there is no possible way + # to derive a string of only terminals). + # ----------------------------------------------------------------------------- -def add_precedence(plist): - plevel = 0 - error = 0 - for p in plist: - plevel += 1 - try: - prec = p[0] - terms = p[1:] - if prec != 'left' and prec != 'right' and prec != 'nonassoc': - sys.stderr.write("yacc: Invalid precedence '%s'\n" % prec) - return -1 - for t in terms: - if t in Precedence: - sys.stderr.write("yacc: Precedence already specified for terminal '%s'\n" % t) - error += 1 - continue - Precedence[t] = (prec,plevel) - except Exception: - sys.stderr.write("yacc: Invalid precedence table.\n") - error += 1 + def infinite_cycles(self): + terminates = {} - return error + # Terminals: + for t in self.Terminals: + terminates[t] = 1 -# ----------------------------------------------------------------------------- -# check_precedence() -# -# Checks the use of the Precedence tables. This makes sure all of the symbols -# are terminals or were used with %prec -# ----------------------------------------------------------------------------- + terminates['$end'] = 1 -def check_precedence(): - error = 0 - for precname in Precedence: - if not (precname in Terminals or precname in UsedPrecedence): - sys.stderr.write("yacc: Precedence rule '%s' defined for unknown symbol '%s'\n" % (Precedence[precname][0],precname)) - error += 1 - return error + # Nonterminals: -# ----------------------------------------------------------------------------- -# augment_grammar() -# -# Compute the augmented grammar. This is just a rule S' -> start where start -# is the starting symbol. -# ----------------------------------------------------------------------------- + # Initialize to false: + for n in self.Nonterminals: + terminates[n] = 0 -def augment_grammar(start=None): - if not start: - start = Productions[1].name - Productions[0] = Production(name="S'",prod=[start],number=0,len=1,prec=('right',0),func=None) - Productions[0].usyms = [ start ] - Nonterminals[start].append(0) + # Then propagate termination until no change: + while 1: + some_change = 0 + for (n,pl) in self.Prodnames.items(): + # Nonterminal n terminates iff any of its productions terminates. + for p in pl: + # Production p terminates iff all of its rhs symbols terminate. + for s in p.prod: + if not terminates[s]: + # The symbol s does not terminate, + # so production p does not terminate. 
+ p_terminates = 0 + break + else: + # didn't break from the loop, + # so every symbol s terminates + # so production p terminates. + p_terminates = 1 + + if p_terminates: + # symbol n terminates! + if not terminates[n]: + terminates[n] = 1 + some_change = 1 + # Don't need to consider any more productions for this n. + break + if not some_change: + break -# ------------------------------------------------------------------------- -# first() -# -# Compute the value of FIRST1(beta) where beta is a tuple of symbols. -# -# During execution of compute_first1, the result may be incomplete. -# Afterward (e.g., when called from compute_follow()), it will be complete. -# ------------------------------------------------------------------------- -def first(beta): - - # We are computing First(x1,x2,x3,...,xn) - result = [ ] - for x in beta: - x_produces_empty = 0 - - # Add all the non- symbols of First[x] to the result. - for f in First[x]: - if f == ' ': - x_produces_empty = 1 - else: - if f not in result: result.append(f) + infinite = [] + for (s,term) in terminates.items(): + if not term: + if not s in self.Prodnames and not s in self.Terminals and s != 'error': + # s is used-but-not-defined, and we've already warned of that, + # so it would be overkill to say that it's also non-terminating. + pass + else: + infinite.append(s) - if x_produces_empty: - # We have to consider the next x in beta, - # i.e. stay in the loop. - pass - else: - # We don't have to consider any further symbols in beta. - break - else: - # There was no 'break' from the loop, - # so x_produces_empty was true for all x in beta, - # so beta produces empty as well. - result.append(' ') - - return result - - -# FOLLOW(x) -# Given a non-terminal. This function computes the set of all symbols -# that might follow it. Dragon book, p. 189. - -def compute_follow(start=None): - # Add '$end' to the follow list of the start symbol - for k in Nonterminals: - Follow[k] = [ ] - - if not start: - start = Productions[1].name - - Follow[start] = [ '$end' ] - - while 1: - didadd = 0 - for p in Productions[1:]: - # Here is the production set - for i in range(len(p.prod)): - B = p.prod[i] - if B in Nonterminals: - # Okay. We got a non-terminal in a production - fst = first(p.prod[i+1:]) - hasempty = 0 - for f in fst: - if f != ' ' and f not in Follow[B]: - Follow[B].append(f) - didadd = 1 - if f == ' ': - hasempty = 1 - if hasempty or i == (len(p.prod)-1): - # Add elements of follow(a) to follow(b) - for f in Follow[p.name]: - if f not in Follow[B]: - Follow[B].append(f) - didadd = 1 - if not didadd: break + return infinite - if 0 and yaccdebug: - _vf.write('\nFollow:\n') - for k in Nonterminals: - _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Follow[k]]))) -# ------------------------------------------------------------------------- -# compute_first1() -# -# Compute the value of FIRST1(X) for all symbols -# ------------------------------------------------------------------------- -def compute_first1(): - - # Terminals: - for t in Terminals: - First[t] = [t] - - First['$end'] = ['$end'] - First['#'] = ['#'] # what's this for? 
-
-    # Nonterminals:
-
-    # Initialize to the empty set:
-    for n in Nonterminals:
-        First[n] = []
-
-    # Then propagate symbols until no change:
-    while 1:
-        some_change = 0
-        for n in Nonterminals:
-            for p in Prodnames[n]:
-                for f in first(p.prod):
-                    if f not in First[n]:
-                        First[n].append( f )
-                        some_change = 1
-        if not some_change:
-            break
-
-    if 0 and yaccdebug:
-        _vf.write('\nFirst:\n')
-        for k in Nonterminals:
-            _vf.write("%-20s : %s\n" %
-                (k, " ".join([str(s) for s in First[k]])))
+    # -----------------------------------------------------------------------------
+    # undefined_symbols()
+    #
+    # Find all symbols that were used in the grammar, but not defined as tokens or
+    # grammar rules.  Returns a list of tuples (sym, prod) where sym is the symbol
+    # and prod is the production where the symbol was used.
+    # -----------------------------------------------------------------------------
+    def undefined_symbols(self):
+        result = []
+        for p in self.Productions:
+            if not p: continue

-# -----------------------------------------------------------------------------
-# === SLR Generation ===
-#
-# The following functions are used to construct SLR (Simple LR) parsing tables
-# as described on p.221-229 of the dragon book.
-# -----------------------------------------------------------------------------
+            for s in p.prod:
+                if not s in self.Prodnames and not s in self.Terminals and s != 'error':
+                    result.append((s,p))
+        return result

-# Global variables for the LR parsing engine
-def lr_init_vars():
-    global _lr_action, _lr_goto, _lr_method
-    global _lr_goto_cache, _lr0_cidhash
-
-    _lr_action = { }         # Action table
-    _lr_goto = { }           # Goto table
-    _lr_method = "Unknown"   # LR method used
-    _lr_goto_cache = { }
-    _lr0_cidhash = { }
-
-
-# Compute the LR(0) closure operation on I, where I is a set of LR(0) items.
-# prodlist is a list of productions.
-
-_add_count = 0       # Counter used to detect cycles
-
-def lr0_closure(I):
-    global _add_count
-
-    _add_count += 1
-    prodlist = Productions
-
-    # Add everything in I to J
-    J = I[:]
-    didadd = 1
-    while didadd:
-        didadd = 0
-        for j in J:
-            for x in j.lrafter:
-                if x.lr0_added == _add_count: continue
-                # Add B --> .G to J
-                J.append(x.lr_next)
-                x.lr0_added = _add_count
-                didadd = 1
-
-    return J
-
-# Compute the LR(0) goto function goto(I,X) where I is a set
-# of LR(0) items and X is a grammar symbol.  This function is written
-# in a way that guarantees uniqueness of the generated goto sets
-# (i.e. the same goto set will never be returned as two different Python
-# objects).  With uniqueness, we can later do fast set comparisons using
-# id(obj) instead of element-wise comparison.
-
-def lr0_goto(I,x):
-    # First we look for a previously cached entry
-    g = _lr_goto_cache.get((id(I),x),None)
-    if g: return g
-
-    # Now we generate the goto set in a way that guarantees uniqueness
-    # of the result
-
-    s = _lr_goto_cache.get(x,None)
-    if not s:
-        s = { }
-        _lr_goto_cache[x] = s
-
-    gs = [ ]
-    for p in I:
-        n = p.lr_next
-        if n and n.lrbefore == x:
-            s1 = s.get(id(n),None)
-            if not s1:
-                s1 = { }
-                s[id(n)] = s1
-            gs.append(n)
-            s = s1
-    g = s.get('$end',None)
-    if not g:
-        if gs:
-            g = lr0_closure(gs)
-            s['$end'] = g
+    # -----------------------------------------------------------------------------
+    # unused_terminals()
+    #
+    # Find all terminals that were defined, but not used by the grammar.  Returns
+    # a list of all symbols.
+ # ----------------------------------------------------------------------------- + def unused_terminals(self): + unused_tok = [] + for s,v in self.Terminals.items(): + if s != 'error' and not v: + unused_tok.append(s) + + return unused_tok + + # ------------------------------------------------------------------------------ + # unused_rules() + # + # Find all grammar rules that were defined, but not used (maybe not reachable) + # Returns a list of productions. + # ------------------------------------------------------------------------------ + + def unused_rules(self): + unused_prod = [] + for s,v in self.Nonterminals.items(): + if not v: + p = self.Prodnames[s][0] + unused_prod.append(p) + return unused_prod + + # ----------------------------------------------------------------------------- + # unused_precedence() + # + # Returns a list of tuples (term,precedence) corresponding to precedence + # rules that were never used by the grammar. term is the name of the terminal + # on which precedence was applied and precedence is a string such as 'left' or + # 'right' corresponding to the type of precedence. + # ----------------------------------------------------------------------------- + + def unused_precedence(self): + unused = [] + for termname in self.Precedence: + if not (termname in self.Terminals or termname in self.UsedPrecedence): + unused.append((termname,self.Precedence[termname][0])) + + return unused + + # ------------------------------------------------------------------------- + # _first() + # + # Compute the value of FIRST1(beta) where beta is a tuple of symbols. + # + # During execution of compute_first1, the result may be incomplete. + # Afterward (e.g., when called from compute_follow()), it will be complete. + # ------------------------------------------------------------------------- + def _first(self,beta): + + # We are computing First(x1,x2,x3,...,xn) + result = [ ] + for x in beta: + x_produces_empty = 0 + + # Add all the non- symbols of First[x] to the result. + for f in self.First[x]: + if f == ' ': + x_produces_empty = 1 + else: + if f not in result: result.append(f) + + if x_produces_empty: + # We have to consider the next x in beta, + # i.e. stay in the loop. + pass + else: + # We don't have to consider any further symbols in beta. + break else: - s['$end'] = gs - _lr_goto_cache[(id(I),x)] = g - return g - -_lr0_cidhash = { } - -# Compute the LR(0) sets of item function -def lr0_items(): - - C = [ lr0_closure([Productions[0].lr_next]) ] - i = 0 - for I in C: - _lr0_cidhash[id(I)] = i - i += 1 - - # Loop over the items in C and each grammar symbols - i = 0 - while i < len(C): - I = C[i] - i += 1 - - # Collect all of the symbols that could possibly be in the goto(I,X) sets - asyms = { } - for ii in I: - for s in ii.usyms: - asyms[s] = None - - for x in asyms: - g = lr0_goto(I,x) - if not g: continue - if id(g) in _lr0_cidhash: continue - _lr0_cidhash[id(g)] = len(C) - C.append(g) - - return C + # There was no 'break' from the loop, + # so x_produces_empty was true for all x in beta, + # so beta produces empty as well. + result.append(' ') -# ----------------------------------------------------------------------------- -# ==== LALR(1) Parsing ==== -# -# LALR(1) parsing is almost exactly the same as SLR except that instead of -# relying upon Follow() sets when performing reductions, a more selective -# lookahead set that incorporates the state of the LR(0) machine is utilized. -# Thus, we mainly just have to focus on calculating the lookahead sets. 
-# -# The method used here is due to DeRemer and Pennelo (1982). -# -# DeRemer, F. L., and T. J. Pennelo: "Efficient Computation of LALR(1) -# Lookahead Sets", ACM Transactions on Programming Languages and Systems, -# Vol. 4, No. 4, Oct. 1982, pp. 615-649 -# -# Further details can also be found in: -# -# J. Tremblay and P. Sorenson, "The Theory and Practice of Compiler Writing", -# McGraw-Hill Book Company, (1985). -# -# Note: This implementation is a complete replacement of the LALR(1) -# implementation in PLY-1.x releases. That version was based on -# a less efficient algorithm and it had bugs in its implementation. -# ----------------------------------------------------------------------------- + return result -# ----------------------------------------------------------------------------- -# compute_nullable_nonterminals() -# -# Creates a dictionary containing all of the non-terminals that might produce -# an empty production. -# ----------------------------------------------------------------------------- + # ------------------------------------------------------------------------- + # compute_first() + # + # Compute the value of FIRST1(X) for all symbols + # ------------------------------------------------------------------------- + def compute_first(self): + if self.First: + return self.First -def compute_nullable_nonterminals(): - nullable = {} - num_nullable = 0 - while 1: - for p in Productions[1:]: - if p.len == 0: - nullable[p.name] = 1 - continue - for t in p.prod: - if not t in nullable: break - else: - nullable[p.name] = 1 - if len(nullable) == num_nullable: break - num_nullable = len(nullable) - return nullable + # Terminals: + for t in self.Terminals: + self.First[t] = [t] -# ----------------------------------------------------------------------------- -# find_nonterminal_trans(C) -# -# Given a set of LR(0) items, this functions finds all of the non-terminal -# transitions. These are transitions in which a dot appears immediately before -# a non-terminal. Returns a list of tuples of the form (state,N) where state -# is the state number and N is the nonterminal symbol. -# -# The input C is the set of LR(0) items. -# ----------------------------------------------------------------------------- + self.First['$end'] = ['$end'] -def find_nonterminal_transitions(C): - trans = [] - for state in range(len(C)): - for p in C[state]: - if p.lr_index < p.len - 1: - t = (state,p.prod[p.lr_index+1]) - if t[1] in Nonterminals: - if t not in trans: trans.append(t) - state = state + 1 - return trans + # Nonterminals: -# ----------------------------------------------------------------------------- -# dr_relation() -# -# Computes the DR(p,A) relationships for non-terminal transitions. The input -# is a tuple (state,N) where state is a number and N is a nonterminal symbol. -# -# Returns a list of terminals. -# ----------------------------------------------------------------------------- + # Initialize to the empty set: + for n in self.Nonterminals: + self.First[n] = [] + + # Then propagate symbols until no change: + while 1: + some_change = 0 + for n in self.Nonterminals: + for p in self.Prodnames[n]: + for f in self._first(p.prod): + if f not in self.First[n]: + self.First[n].append( f ) + some_change = 1 + if not some_change: + break + + return self.First + + # --------------------------------------------------------------------- + # compute_follow() + # + # Computes all of the follow sets for every non-terminal symbol. 
The + # follow set is the set of all symbols that might follow a given + # non-terminal. See the Dragon book, 2nd Ed. p. 189. + # --------------------------------------------------------------------- + def compute_follow(self,start=None): + # If already computed, return the result + if self.Follow: + return self.Follow + + # If first sets not computed yet, do that first. + if not self.First: + self.compute_first() -def dr_relation(C,trans,nullable): - dr_set = { } - state,N = trans - terms = [] + # Add '$end' to the follow list of the start symbol + for k in self.Nonterminals: + self.Follow[k] = [ ] - g = lr0_goto(C[state],N) - for p in g: - if p.lr_index < p.len - 1: - a = p.prod[p.lr_index+1] - if a in Terminals: - if a not in terms: terms.append(a) + if not start: + start = self.Productions[1].name - # This extra bit is to handle the start state - if state == 0 and N == Productions[0].prod[0]: - terms.append('$end') + self.Follow[start] = [ '$end' ] - return terms + while 1: + didadd = 0 + for p in self.Productions[1:]: + # Here is the production set + for i in range(len(p.prod)): + B = p.prod[i] + if B in self.Nonterminals: + # Okay. We got a non-terminal in a production + fst = self._first(p.prod[i+1:]) + hasempty = 0 + for f in fst: + if f != ' ' and f not in self.Follow[B]: + self.Follow[B].append(f) + didadd = 1 + if f == ' ': + hasempty = 1 + if hasempty or i == (len(p.prod)-1): + # Add elements of follow(a) to follow(b) + for f in self.Follow[p.name]: + if f not in self.Follow[B]: + self.Follow[B].append(f) + didadd = 1 + if not didadd: break + return self.Follow + + + # ----------------------------------------------------------------------------- + # build_lritems() + # + # This function walks the list of productions and builds a complete set of the + # LR items. The LR items are stored in two ways: First, they are uniquely + # numbered and placed in the list _lritems. Second, a linked list of LR items + # is built for each production. For example: + # + # E -> E PLUS E + # + # Creates the list + # + # [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ] + # ----------------------------------------------------------------------------- + + def build_lritems(self): + for p in self.Productions: + lastlri = p + i = 0 + lr_items = [] + while 1: + if i > len(p): + lri = None + else: + lri = LRItem(p,i) + # Precompute the list of productions immediately following + try: + lri.lr_after = self.Prodnames[lri.prod[i+1]] + except (IndexError,KeyError): + lri.lr_after = [] + try: + lri.lr_before = lri.prod[i-1] + except IndexError: + lri.lr_before = None + + lastlri.lr_next = lri + if not lri: break + lr_items.append(lri) + lastlri = lri + i += 1 + p.lr_items = lr_items # ----------------------------------------------------------------------------- -# reads_relation() +# == Class LRTable == # -# Computes the READS() relation (p,A) READS (t,C). +# This basic class represents a basic table of LR parsing information. +# Methods for generating the tables are not defined here. They are defined +# in the derived class LRGeneratedTable. 
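As a concrete illustration of the Grammar-level computations defined above, here is a hedged usage sketch on a hypothetical two-operator grammar. It assumes Grammar is importable from ply.yacc as this patch defines it, and the list ordering in the results is an implementation detail:

    from ply.yacc import Grammar    # assumed import path

    g = Grammar(['PLUS', 'TIMES', 'NUMBER'])
    g.set_precedence('PLUS',  'left', 1)    # must precede add_production()
    g.set_precedence('TIMES', 'left', 2)
    g.add_production('expr', ['expr', 'PLUS', 'expr'])
    g.add_production('expr', ['expr', 'TIMES', 'expr'])
    g.add_production('expr', ['NUMBER'])

    g.compute_first()     # g.First['expr']  == ['NUMBER']
    g.compute_follow()    # g.Follow['expr'] == ['$end', 'PLUS', 'TIMES']

The two set_precedence() calls are what later let the table generator resolve the shift/reduce conflicts that this deliberately ambiguous grammar produces.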
# ----------------------------------------------------------------------------- -def reads_relation(C, trans, empty): - # Look for empty transitions - rel = [] - state, N = trans +class VersionError(YaccError): pass + +class LRTable(object): + def __init__(self): + self.lr_action = None + self.lr_goto = None + self.lr_productions = None + self.lr_method = None + + def read_table(self,module): + if isinstance(module,types.ModuleType): + parsetab = module + else: + if sys.version_info[0] < 3: + exec("import %s as parsetab" % module) + else: + env = { } + exec("import %s as parsetab" % module, env, env) + parsetab = env['parsetab'] + + if parsetab._tabversion != __tabversion__: + raise VersionError("yacc table file version is out of date") + + self.lr_action = parsetab._lr_action + self.lr_goto = parsetab._lr_goto - g = lr0_goto(C[state],N) - j = _lr0_cidhash.get(id(g),-1) - for p in g: - if p.lr_index < p.len - 1: - a = p.prod[p.lr_index + 1] - if a in empty: - rel.append((j,a)) + self.lr_productions = [] + for p in parsetab._lr_productions: + self.lr_productions.append(MiniProduction(*p)) - return rel + self.lr_method = parsetab._lr_method + return parsetab._lr_signature + # Bind all production function names to callable objects in pdict + def bind_callables(self,pdict): + for p in self.lr_productions: + p.bind(pdict) + # ----------------------------------------------------------------------------- -# compute_lookback_includes() -# -# Determines the lookback and includes relations -# -# LOOKBACK: -# -# This relation is determined by running the LR(0) state machine forward. -# For example, starting with a production "N : . A B C", we run it forward -# to obtain "N : A B C ." We then build a relationship between this final -# state and the starting state. These relationships are stored in a dictionary -# lookdict. -# -# INCLUDES: -# -# Computes the INCLUDE() relation (p,A) INCLUDES (p',B). -# -# This relation is used to determine non-terminal transitions that occur -# inside of other non-terminal transition states. (p,A) INCLUDES (p', B) -# if the following holds: -# -# B -> LAT, where T -> epsilon and p' -L-> p -# -# L is essentially a prefix (which may be empty), T is a suffix that must be -# able to derive an empty string. State p' must lead to state p with the string L. +# === LR Generator === # +# The following classes and functions are used to generate LR parsing tables on +# a grammar. # ----------------------------------------------------------------------------- -def compute_lookback_includes(C,trans,nullable): - - lookdict = {} # Dictionary of lookback relations - includedict = {} # Dictionary of include relations - - # Make a dictionary of non-terminal transitions - dtrans = {} - for t in trans: - dtrans[t] = 1 - - # Loop over all transitions and compute lookbacks and includes - for state,N in trans: - lookb = [] - includes = [] - for p in C[state]: - if p.name != N: continue - - # Okay, we have a name match. We now follow the production all the way - # through the state machine until we get the . on the right hand side - - lr_index = p.lr_index - j = state - while lr_index < p.len - 1: - lr_index = lr_index + 1 - t = p.prod[lr_index] - - # Check to see if this symbol and state are a non-terminal transition - if (j,t) in dtrans: - # Yes. 
Okay, there is some chance that this is an includes relation
-                    # the only way to know for certain is whether the rest of the
-                    # production derives empty
-
-                    li = lr_index + 1
-                    while li < p.len:
-                        if p.prod[li] in Terminals: break      # No forget it
-                        if not p.prod[li] in nullable: break
-                        li = li + 1
-                    else:
-                        # Appears to be a relation between (j,t) and (state,N)
-                        includes.append((j,t))
-
-                g = lr0_goto(C[j],t)               # Go to next set
-                j = _lr0_cidhash.get(id(g),-1)     # Go to next state
-
-            # When we get here, j is the final state, now we have to locate the production
-            for r in C[j]:
-                if r.name != p.name: continue
-                if r.len != p.len: continue
-                i = 0
-                # This look is comparing a production ". A B C" with "A B C ."
-                while i < r.lr_index:
-                    if r.prod[i] != p.prod[i+1]: break
-                    i = i + 1
-                else:
-                    lookb.append((j,r))
-        for i in includes:
-            if not i in includedict: includedict[i] = []
-            includedict[i].append((state,N))
-        lookdict[(state,N)] = lookb
-
-    return lookdict,includedict
-
 # -----------------------------------------------------------------------------
 # digraph()
 # traverse()
@@ -2224,349 +1870,659 @@ def traverse(x,N,stack,F,X,R,FP):
     F[stack[-1]] = F[x]
     element = stack.pop()
 
+class LALRError(YaccError): pass
+
 # -----------------------------------------------------------------------------
-# compute_read_sets()
+# == LRGeneratedTable ==
 #
-# Given a set of LR(0) items, this function computes the read sets.
-#
-# Inputs:  C = Set of LR(0) items
-#          ntrans = Set of nonterminal transitions
-#          nullable = Set of empty transitions
-#
-# Returns a set containing the read sets
+# This class implements the LR table generation algorithm.  There are no
+# public methods except for write()
 # -----------------------------------------------------------------------------
-def compute_read_sets(C, ntrans, nullable):
-    FP = lambda x: dr_relation(C,x,nullable)
-    R = lambda x: reads_relation(C,x,nullable)
-    F = digraph(ntrans,R,FP)
-    return F
+class LRGeneratedTable(LRTable):
+    def __init__(self,grammar,method='LALR',log=None):
+        if method not in ['SLR','LALR']:
+            raise LALRError("Unsupported method %s" % method)
+
+        self.grammar = grammar
+        self.lr_method = method
+
+        # Set up the logger
+        if not log:
+            log = NullLogger()
+        self.log = log
+
+        # Internal attributes
+        self.lr_action = {}        # Action table
+        self.lr_goto = {}          # Goto table
+        self.lr_productions = grammar.Productions    # Copy of grammar Production array
+        self.lr_goto_cache = {}    # Cache of computed gotos
+        self.lr0_cidhash = {}      # Cache of closures
+
+        self._add_count = 0        # Internal counter used to detect cycles
+
+        # Diagnostic information filled in by the table generator
+        self.sr_conflict = 0
+        self.rr_conflict = 0
+        self.conflicts = []        # List of conflicts
+
+        self.sr_conflicts = []
+        self.rr_conflicts = []
+
+        # Build the tables
+        self.grammar.build_lritems()
+        self.grammar.compute_first()
+        self.grammar.compute_follow()
+        self.lr_parse_table()
+
+    # Compute the LR(0) closure operation on I, where I is a set of LR(0) items.
+
+    def lr0_closure(self,I):
+        self._add_count += 1
+
+        # Add everything in I to J
+        J = I[:]
+        didadd = 1
+        while didadd:
+            didadd = 0
+            for j in J:
+                for x in j.lr_after:
+                    if getattr(x,"lr0_added",0) == self._add_count: continue
+                    # Add B --> .G to J
+                    J.append(x.lr_next)
+                    x.lr0_added = self._add_count
+                    didadd = 1
+
+        return J
+
+    # Compute the LR(0) goto function goto(I,X) where I is a set
+    # of LR(0) items and X is a grammar symbol.
This function is written + # in a way that guarantees uniqueness of the generated goto sets + # (i.e. the same goto set will never be returned as two different Python + # objects). With uniqueness, we can later do fast set comparisons using + # id(obj) instead of element-wise comparison. + + def lr0_goto(self,I,x): + # First we look for a previously cached entry + g = self.lr_goto_cache.get((id(I),x),None) + if g: return g + + # Now we generate the goto set in a way that guarantees uniqueness + # of the result + + s = self.lr_goto_cache.get(x,None) + if not s: + s = { } + self.lr_goto_cache[x] = s + + gs = [ ] + for p in I: + n = p.lr_next + if n and n.lr_before == x: + s1 = s.get(id(n),None) + if not s1: + s1 = { } + s[id(n)] = s1 + gs.append(n) + s = s1 + g = s.get('$end',None) + if not g: + if gs: + g = self.lr0_closure(gs) + s['$end'] = g + else: + s['$end'] = gs + self.lr_goto_cache[(id(I),x)] = g + return g -# ----------------------------------------------------------------------------- -# compute_follow_sets() -# -# Given a set of LR(0) items, a set of non-terminal transitions, a readset, -# and an include set, this function computes the follow sets -# -# Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)} -# -# Inputs: -# ntrans = Set of nonterminal transitions -# readsets = Readset (previously computed) -# inclsets = Include sets (previously computed) -# -# Returns a set containing the follow sets -# ----------------------------------------------------------------------------- + # Compute the LR(0) sets of item function + def lr0_items(self): -def compute_follow_sets(ntrans,readsets,inclsets): - FP = lambda x: readsets[x] - R = lambda x: inclsets.get(x,[]) - F = digraph(ntrans,R,FP) - return F + C = [ self.lr0_closure([self.grammar.Productions[0].lr_next]) ] + i = 0 + for I in C: + self.lr0_cidhash[id(I)] = i + i += 1 -# ----------------------------------------------------------------------------- -# add_lookaheads() -# -# Attaches the lookahead symbols to grammar rules. 
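Stepping back from the item-set machinery for a moment, the end-to-end workflow enabled by this class-based refactoring looks roughly like the following. A hedged sketch only: g is assumed to be a fully populated Grammar with its start rule set, and the module name 'parsetab' is illustrative.

    import sys

    lr = LRGeneratedTable(g, method='LALR', log=PlyLogger(sys.stderr))
    if lr.sr_conflicts or lr.rr_conflicts:
        print("%d shift/reduce, %d reduce/reduce conflicts"
              % (len(lr.sr_conflicts), len(lr.rr_conflicts)))
    lr.write_table('parsetab', outputdir='', signature='')

    # A later run can bypass generation by loading the cached tables:
    cached = LRTable()
    cached.read_table('parsetab')    # raises VersionError if out of date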
-#
-# Inputs:  lookbacks  - Set of lookback relations
-#          followset  - Computed follow set
-#
-# This function directly attaches the lookaheads to productions contained
-# in the lookbacks set
-# -----------------------------------------------------------------------------
+        # Loop over the items in C and each grammar symbol
+        i = 0
+        while i < len(C):
+            I = C[i]
+            i += 1

-def add_lookaheads(lookbacks,followset):
-    for trans,lb in lookbacks.items():
-        # Loop over productions in lookback
-        for state,p in lb:
-            if not state in p.lookaheads:
-                p.lookaheads[state] = []
-            f = followset.get(trans,[])
-            for a in f:
-                if a not in p.lookaheads[state]: p.lookaheads[state].append(a)
+            # Collect all of the symbols that could possibly be in the goto(I,X) sets
+            asyms = { }
+            for ii in I:
+                for s in ii.usyms:
+                    asyms[s] = None

-# -----------------------------------------------------------------------------
-# add_lalr_lookaheads()
-#
-# This function does all of the work of adding lookahead information for use
-# with LALR parsing
-# -----------------------------------------------------------------------------
+            for x in asyms:
+                g = self.lr0_goto(I,x)
+                if not g: continue
+                if id(g) in self.lr0_cidhash: continue
+                self.lr0_cidhash[id(g)] = len(C)
+                C.append(g)

-def add_lalr_lookaheads(C):
-    # Determine all of the nullable nonterminals
-    nullable = compute_nullable_nonterminals()
+        return C

-    # Find all non-terminal transitions
-    trans = find_nonterminal_transitions(C)
+    # -----------------------------------------------------------------------------
+    # ==== LALR(1) Parsing ====
+    #
+    # LALR(1) parsing is almost exactly the same as SLR except that instead of
+    # relying upon Follow() sets when performing reductions, a more selective
+    # lookahead set that incorporates the state of the LR(0) machine is utilized.
+    # Thus, we mainly just have to focus on calculating the lookahead sets.
+    #
+    # The method used here is due to DeRemer and Pennello (1982).
+    #
+    # DeRemer, F. L., and T. J. Pennello: "Efficient Computation of LALR(1)
+    # Lookahead Sets", ACM Transactions on Programming Languages and Systems,
+    # Vol. 4, No. 4, Oct. 1982, pp. 615-649
+    #
+    # Further details can also be found in:
+    #
+    # J. Tremblay and P. Sorenson, "The Theory and Practice of Compiler Writing",
+    # McGraw-Hill Book Company, (1985).
+    #
+    # -----------------------------------------------------------------------------

-    # Compute read sets
-    readsets = compute_read_sets(C,trans,nullable)
+    # -----------------------------------------------------------------------------
+    # compute_nullable_nonterminals()
+    #
+    # Creates a dictionary containing all of the non-terminals that might produce
+    # an empty production.
+    # -----------------------------------------------------------------------------

-    # Compute lookback/includes relations
-    lookd, included = compute_lookback_includes(C,trans,nullable)
+    def compute_nullable_nonterminals(self):
+        nullable = {}
+        num_nullable = 0
+        while 1:
+            for p in self.grammar.Productions[1:]:
+                if p.len == 0:
+                    nullable[p.name] = 1
+                    continue
+                for t in p.prod:
+                    if not t in nullable: break
+                else:
+                    nullable[p.name] = 1
+            if len(nullable) == num_nullable: break
+            num_nullable = len(nullable)
+        return nullable

+    # -----------------------------------------------------------------------------
+    # find_nonterminal_trans(C)
+    #
+    # Given a set of LR(0) items, this function finds all of the non-terminal
+    # transitions.  These are transitions in which a dot appears immediately before
+    # a non-terminal.
Returns a list of tuples of the form (state,N) where state + # is the state number and N is the nonterminal symbol. + # + # The input C is the set of LR(0) items. + # ----------------------------------------------------------------------------- + + def find_nonterminal_transitions(self,C): + trans = [] + for state in range(len(C)): + for p in C[state]: + if p.lr_index < p.len - 1: + t = (state,p.prod[p.lr_index+1]) + if t[1] in self.grammar.Nonterminals: + if t not in trans: trans.append(t) + state = state + 1 + return trans + + # ----------------------------------------------------------------------------- + # dr_relation() + # + # Computes the DR(p,A) relationships for non-terminal transitions. The input + # is a tuple (state,N) where state is a number and N is a nonterminal symbol. + # + # Returns a list of terminals. + # ----------------------------------------------------------------------------- - # Compute LALR FOLLOW sets - followsets = compute_follow_sets(trans,readsets,included) + def dr_relation(self,C,trans,nullable): + dr_set = { } + state,N = trans + terms = [] - # Add all of the lookaheads - add_lookaheads(lookd,followsets) + g = self.lr0_goto(C[state],N) + for p in g: + if p.lr_index < p.len - 1: + a = p.prod[p.lr_index+1] + if a in self.grammar.Terminals: + if a not in terms: terms.append(a) -# ----------------------------------------------------------------------------- -# lr_parse_table() -# -# This function constructs the parse tables for SLR or LALR -# ----------------------------------------------------------------------------- -def lr_parse_table(method): - global _lr_method - goto = _lr_goto # Goto array - action = _lr_action # Action array - actionp = { } # Action production array (temporary) + # This extra bit is to handle the start state + if state == 0 and N == self.grammar.Productions[0].prod[0]: + terms.append('$end') - _lr_method = method + return terms - n_srconflict = 0 - n_rrconflict = 0 + # ----------------------------------------------------------------------------- + # reads_relation() + # + # Computes the READS() relation (p,A) READS (t,C). + # ----------------------------------------------------------------------------- + + def reads_relation(self,C, trans, empty): + # Look for empty transitions + rel = [] + state, N = trans + + g = self.lr0_goto(C[state],N) + j = self.lr0_cidhash.get(id(g),-1) + for p in g: + if p.lr_index < p.len - 1: + a = p.prod[p.lr_index + 1] + if a in empty: + rel.append((j,a)) + + return rel + + # ----------------------------------------------------------------------------- + # compute_lookback_includes() + # + # Determines the lookback and includes relations + # + # LOOKBACK: + # + # This relation is determined by running the LR(0) state machine forward. + # For example, starting with a production "N : . A B C", we run it forward + # to obtain "N : A B C ." We then build a relationship between this final + # state and the starting state. These relationships are stored in a dictionary + # lookdict. + # + # INCLUDES: + # + # Computes the INCLUDE() relation (p,A) INCLUDES (p',B). + # + # This relation is used to determine non-terminal transitions that occur + # inside of other non-terminal transition states. (p,A) INCLUDES (p', B) + # if the following holds: + # + # B -> LAT, where T -> epsilon and p' -L-> p + # + # L is essentially a prefix (which may be empty), T is a suffix that must be + # able to derive an empty string. State p' must lead to state p with the string L. 
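+    #
+    # Concretely: given B -> L A T with T nullable, if the machine moves from
+    # state p' to state p while reading L, then whatever can follow B at p'
+    # can also follow A at p, so Follow(p,A) must absorb Follow(p',B).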
+ # + # ----------------------------------------------------------------------------- + + def compute_lookback_includes(self,C,trans,nullable): + + lookdict = {} # Dictionary of lookback relations + includedict = {} # Dictionary of include relations + + # Make a dictionary of non-terminal transitions + dtrans = {} + for t in trans: + dtrans[t] = 1 + + # Loop over all transitions and compute lookbacks and includes + for state,N in trans: + lookb = [] + includes = [] + for p in C[state]: + if p.name != N: continue + + # Okay, we have a name match. We now follow the production all the way + # through the state machine until we get the . on the right hand side + + lr_index = p.lr_index + j = state + while lr_index < p.len - 1: + lr_index = lr_index + 1 + t = p.prod[lr_index] + + # Check to see if this symbol and state are a non-terminal transition + if (j,t) in dtrans: + # Yes. Okay, there is some chance that this is an includes relation + # the only way to know for certain is whether the rest of the + # production derives empty + + li = lr_index + 1 + while li < p.len: + if p.prod[li] in self.grammar.Terminals: break # No forget it + if not p.prod[li] in nullable: break + li = li + 1 + else: + # Appears to be a relation between (j,t) and (state,N) + includes.append((j,t)) + + g = self.lr0_goto(C[j],t) # Go to next set + j = self.lr0_cidhash.get(id(g),-1) # Go to next state + + # When we get here, j is the final state, now we have to locate the production + for r in C[j]: + if r.name != p.name: continue + if r.len != p.len: continue + i = 0 + # This look is comparing a production ". A B C" with "A B C ." + while i < r.lr_index: + if r.prod[i] != p.prod[i+1]: break + i = i + 1 + else: + lookb.append((j,r)) + for i in includes: + if not i in includedict: includedict[i] = [] + includedict[i].append((state,N)) + lookdict[(state,N)] = lookb + + return lookdict,includedict + + # ----------------------------------------------------------------------------- + # compute_read_sets() + # + # Given a set of LR(0) items, this function computes the read sets. 
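+    #
+    # Formally:  Read(p,A) = DR(p,A) U U{ Read(t,C) | (p,A) READS (t,C) },
+    # solved as a fixed point over the READS relation using digraph().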
+ # + # Inputs: C = Set of LR(0) items + # ntrans = Set of nonterminal transitions + # nullable = Set of empty transitions + # + # Returns a set containing the read sets + # ----------------------------------------------------------------------------- + + def compute_read_sets(self,C, ntrans, nullable): + FP = lambda x: self.dr_relation(C,x,nullable) + R = lambda x: self.reads_relation(C,x,nullable) + F = digraph(ntrans,R,FP) + return F + + # ----------------------------------------------------------------------------- + # compute_follow_sets() + # + # Given a set of LR(0) items, a set of non-terminal transitions, a readset, + # and an include set, this function computes the follow sets + # + # Follow(p,A) = Read(p,A) U U {Follow(p',B) | (p,A) INCLUDES (p',B)} + # + # Inputs: + # ntrans = Set of nonterminal transitions + # readsets = Readset (previously computed) + # inclsets = Include sets (previously computed) + # + # Returns a set containing the follow sets + # ----------------------------------------------------------------------------- + + def compute_follow_sets(self,ntrans,readsets,inclsets): + FP = lambda x: readsets[x] + R = lambda x: inclsets.get(x,[]) + F = digraph(ntrans,R,FP) + return F - if yaccdebug: - sys.stderr.write("yacc: Generating %s parsing table...\n" % method) - _vf.write("\n\nParsing method: %s\n\n" % method) + # ----------------------------------------------------------------------------- + # add_lookaheads() + # + # Attaches the lookahead symbols to grammar rules. + # + # Inputs: lookbacks - Set of lookback relations + # followset - Computed follow set + # + # This function directly attaches the lookaheads to productions contained + # in the lookbacks set + # ----------------------------------------------------------------------------- + + def add_lookaheads(self,lookbacks,followset): + for trans,lb in lookbacks.items(): + # Loop over productions in lookback + for state,p in lb: + if not state in p.lookaheads: + p.lookaheads[state] = [] + f = followset.get(trans,[]) + for a in f: + if a not in p.lookaheads[state]: p.lookaheads[state].append(a) + + # ----------------------------------------------------------------------------- + # add_lalr_lookaheads() + # + # This function does all of the work of adding lookahead information for use + # with LALR parsing + # ----------------------------------------------------------------------------- - # Step 1: Construct C = { I0, I1, ... 
IN}, collection of LR(0) items - # This determines the number of states + def add_lalr_lookaheads(self,C): + # Determine all of the nullable nonterminals + nullable = self.compute_nullable_nonterminals() - C = lr0_items() + # Find all non-terminal transitions + trans = self.find_nonterminal_transitions(C) - if method == 'LALR': - add_lalr_lookaheads(C) + # Compute read sets + readsets = self.compute_read_sets(C,trans,nullable) + # Compute lookback/includes relations + lookd, included = self.compute_lookback_includes(C,trans,nullable) - # Build the parser table, state by state - st = 0 - for I in C: - # Loop over each production in I - actlist = [ ] # List of actions - st_action = { } - st_actionp = { } - st_goto = { } - if yaccdebug: - _vf.write("\nstate %d\n\n" % st) + # Compute LALR FOLLOW sets + followsets = self.compute_follow_sets(trans,readsets,included) + + # Add all of the lookaheads + self.add_lookaheads(lookd,followsets) + + # ----------------------------------------------------------------------------- + # lr_parse_table() + # + # This function constructs the parse tables for SLR or LALR + # ----------------------------------------------------------------------------- + def lr_parse_table(self): + goto = self.lr_goto # Goto array + action = self.lr_action # Action array + log = self.log # Logger for output + + actionp = { } # Action production array (temporary) + + log.info("Parsing method: %s", self.lr_method) + + # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items + # This determines the number of states + + C = self.lr0_items() + + if self.lr_method == 'LALR': + self.add_lalr_lookaheads(C) + + # Build the parser table, state by state + st = 0 + for I in C: + # Loop over each production in I + actlist = [ ] # List of actions + st_action = { } + st_actionp = { } + st_goto = { } + log.info("") + log.info("state %d", st) + log.info("") for p in I: - _vf.write(" (%d) %s\n" % (p.number, str(p))) - _vf.write("\n") + log.info(" (%d) %s", p.number, str(p)) + log.info("") - for p in I: - try: - if p.len == p.lr_index + 1: - if p.name == "S'": - # Start symbol. Accept! - st_action["$end"] = 0 - st_actionp["$end"] = p - else: - # We are at the end of a production. Reduce! - if method == 'LALR': - laheads = p.lookaheads[st] + for p in I: + if p.len == p.lr_index + 1: + if p.name == "S'": + # Start symbol. Accept! + st_action["$end"] = 0 + st_actionp["$end"] = p else: - laheads = Follow[p.name] - for a in laheads: - actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) - r = st_action.get(a,None) - if r is not None: - # Whoa. Have a shift/reduce or reduce/reduce conflict - if r > 0: - # Need to decide on shift or reduce here - # By default we favor shifting. Need to add - # some precedence rules here. - sprec,slevel = Productions[st_actionp[a].number].prec - rprec,rlevel = Precedence.get(a,('right',0)) - if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): - # We really need to reduce here. - st_action[a] = -p.number - st_actionp[a] = p - if not slevel and not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - n_srconflict += 1 - elif (slevel == rlevel) and (rprec == 'nonassoc'): - st_action[a] = None - else: - # Hmmm. Guess we'll keep the shift - if not rlevel: - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! 
shift/reduce conflict for %s resolved as shift.\n" % a) - n_srconflict +=1 - elif r < 0: - # Reduce/reduce conflict. In this case, we favor the rule - # that was defined first in the grammar file - oldp = Productions[-r] - pp = Productions[p.number] - if oldp.line > pp.line: - st_action[a] = -p.number - st_actionp[a] = p - # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st) - n_rrconflict += 1 - _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, st_actionp[a].number, st_actionp[a])) - _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,st_actionp[a].number, st_actionp[a])) - else: - sys.stderr.write("Unknown conflict in state %d\n" % st) + # We are at the end of a production. Reduce! + if self.lr_method == 'LALR': + laheads = p.lookaheads[st] else: - st_action[a] = -p.number - st_actionp[a] = p - else: - i = p.lr_index - a = p.prod[i+1] # Get symbol right after the "." - if a in Terminals: - g = lr0_goto(I,a) - j = _lr0_cidhash.get(id(g),-1) - if j >= 0: - # We are in a shift state - actlist.append((a,p,"shift and go to state %d" % j)) - r = st_action.get(a,None) - if r is not None: - # Whoa have a shift/reduce or shift/shift conflict - if r > 0: - if r != j: - sys.stderr.write("Shift/shift conflict in state %d\n" % st) - elif r < 0: - # Do a precedence check. - # - if precedence of reduce rule is higher, we reduce. - # - if precedence of reduce is same and left assoc, we reduce. - # - otherwise we shift - rprec,rlevel = Productions[st_actionp[a].number].prec - sprec,slevel = Precedence.get(a,('right',0)) - if (slevel > rlevel) or ((slevel == rlevel) and (rprec == 'right')): - # We decide to shift here... highest precedence to shift - st_action[a] = j - st_actionp[a] = p - if not rlevel: - n_srconflict += 1 - _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) - elif (slevel == rlevel) and (rprec == 'nonassoc'): - st_action[a] = None + laheads = self.grammar.Follow[p.name] + for a in laheads: + actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) + r = st_action.get(a,None) + if r is not None: + # Whoa. Have a shift/reduce or reduce/reduce conflict + if r > 0: + # Need to decide on shift or reduce here + # By default we favor shifting. Need to add + # some precedence rules here. + sprec,slevel = self.grammar.Productions[st_actionp[a].number].prec + rprec,rlevel = self.grammar.Precedence.get(a,('right',0)) + if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): + # We really need to reduce here. + st_action[a] = -p.number + st_actionp[a] = p + if not slevel and not rlevel: + log.info(" ! shift/reduce conflict for %s resolved as reduce",a) + self.sr_conflicts.append((st,a,'reduce')) + elif (slevel == rlevel) and (rprec == 'nonassoc'): + st_action[a] = None + else: + # Hmmm. Guess we'll keep the shift + if not rlevel: + log.info(" ! shift/reduce conflict for %s resolved as shift",a) + self.sr_conflicts.append((st,a,'shift')) + elif r < 0: + # Reduce/reduce conflict. In this case, we favor the rule + # that was defined first in the grammar file + oldp = self.grammar.Productions[-r] + pp = self.grammar.Productions[p.number] + if oldp.line > pp.line: + st_action[a] = -p.number + st_actionp[a] = p + chosenp,rejectp = pp,oldp + else: + chosenp,rejectp = oldp,pp + self.rr_conflicts.append((st,chosenp,rejectp)) + log.info(" ! 
reduce/reduce conflict for %s resolved using rule %d (%s)", a,st_actionp[a].number, st_actionp[a]) else: - # Hmmm. Guess we'll keep the reduce - if not slevel and not rlevel: - n_srconflict +=1 - _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) - _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) - + raise LALRError("Unknown conflict in state %d" % st) else: - sys.stderr.write("Unknown conflict in state %d\n" % st) - else: - st_action[a] = j - st_actionp[a] = p + st_action[a] = -p.number + st_actionp[a] = p + else: + i = p.lr_index + a = p.prod[i+1] # Get symbol right after the "." + if a in self.grammar.Terminals: + g = self.lr0_goto(I,a) + j = self.lr0_cidhash.get(id(g),-1) + if j >= 0: + # We are in a shift state + actlist.append((a,p,"shift and go to state %d" % j)) + r = st_action.get(a,None) + if r is not None: + # Whoa have a shift/reduce or shift/shift conflict + if r > 0: + if r != j: + raise LALRError("Shift/shift conflict in state %d" % st) + elif r < 0: + # Do a precedence check. + # - if precedence of reduce rule is higher, we reduce. + # - if precedence of reduce is same and left assoc, we reduce. + # - otherwise we shift + rprec,rlevel = self.grammar.Productions[st_actionp[a].number].prec + sprec,slevel = self.grammar.Precedence.get(a,('right',0)) + if (slevel > rlevel) or ((slevel == rlevel) and (rprec == 'right')): + # We decide to shift here... highest precedence to shift + st_action[a] = j + st_actionp[a] = p + if not rlevel: + log.info(" ! shift/reduce conflict for %s resolved as shift",a) + self.sr_conflicts.append((st,a,'shift')) + elif (slevel == rlevel) and (rprec == 'nonassoc'): + st_action[a] = None + else: + # Hmmm. Guess we'll keep the reduce + if not slevel and not rlevel: + log.info(" ! shift/reduce conflict for %s resolved as reduce",a) + self.sr_conflicts.append((st,a,'reduce')) - except Exception: - sys.stdout.write(str(sys.exc_info()) + "\n") - raise YaccError("Hosed in lr_parse_table") - - # Print the actions associated with each terminal - if yaccdebug: - _actprint = { } - for a,p,m in actlist: - if a in st_action: - if p is st_actionp[a]: - _vf.write(" %-15s %s\n" % (a,m)) - _actprint[(a,m)] = 1 - _vf.write("\n") - for a,p,m in actlist: - if a in st_action: - if p is not st_actionp[a]: - if not (a,m) in _actprint: - _vf.write(" ! %-15s [ %s ]\n" % (a,m)) + else: + raise LALRError("Unknown conflict in state %d" % st) + else: + st_action[a] = j + st_actionp[a] = p + + # Print the actions associated with each terminal + _actprint = { } + for a,p,m in actlist: + if a in st_action: + if p is st_actionp[a]: + log.info(" %-15s %s",a,m) _actprint[(a,m)] = 1 + log.info("") + # Print the actions that were not used. (debugging) + not_used = 0 + for a,p,m in actlist: + if a in st_action: + if p is not st_actionp[a]: + if not (a,m) in _actprint: + log.debug(" ! 
%-15s [ %s ]",a,m) + not_used = 1 + _actprint[(a,m)] = 1 + if not_used: + log.debug("") + + # Construct the goto table for this state + + nkeys = { } + for ii in I: + for s in ii.usyms: + if s in self.grammar.Nonterminals: + nkeys[s] = None + for n in nkeys: + g = self.lr0_goto(I,n) + j = self.lr0_cidhash.get(id(g),-1) + if j >= 0: + st_goto[n] = j + log.info(" %-30s shift and go to state %d",n,j) + + action[st] = st_action + actionp[st] = st_actionp + goto[st] = st_goto + st += 1 + + + # ----------------------------------------------------------------------------- + # write() + # + # This function writes the LR parsing tables to a file + # ----------------------------------------------------------------------------- - # Construct the goto table for this state - if yaccdebug: - _vf.write("\n") - nkeys = { } - for ii in I: - for s in ii.usyms: - if s in Nonterminals: - nkeys[s] = None - for n in nkeys: - g = lr0_goto(I,n) - j = _lr0_cidhash.get(id(g),-1) - if j >= 0: - st_goto[n] = j - if yaccdebug: - _vf.write(" %-30s shift and go to state %d\n" % (n,j)) - - action[st] = st_action - actionp[st] = st_actionp - goto[st] = st_goto - - st += 1 - - if yaccdebug: - if n_srconflict == 1: - sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) - if n_srconflict > 1: - sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) - if n_rrconflict == 1: - sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) - if n_rrconflict > 1: - sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) - -# ----------------------------------------------------------------------------- -# ==== LR Utility functions ==== -# ----------------------------------------------------------------------------- - -# ----------------------------------------------------------------------------- -# _lr_write_tables() -# -# This function writes the LR parsing tables to a file -# ----------------------------------------------------------------------------- - -def lr_write_tables(modulename=tab_module,outputdir=''): - if isinstance(modulename, types.ModuleType): - sys.stderr.write("Warning module %s is inconsistent with the grammar (ignored)\n" % modulename) - return - - basemodulename = modulename.split(".")[-1] - filename = os.path.join(outputdir,basemodulename) + ".py" - try: - f = open(filename,"w") + def write_table(self,modulename,outputdir='',signature=""): + basemodulename = modulename.split(".")[-1] + filename = os.path.join(outputdir,basemodulename) + ".py" + try: + f = open(filename,"w") - f.write(""" + f.write(""" # %s # This file is automatically generated. Do not edit. 
+_tabversion = %r -_lr_method = %s - -_lr_signature = %s -""" % (filename, repr(_lr_method), repr(Signature.digest()))) - - # Change smaller to 0 to go back to original tables - smaller = 1 +_lr_method = %r - # Factor out names to try and make smaller - if smaller: - items = { } +_lr_signature = %r + """ % (filename, __tabversion__, self.lr_method, signature)) - for s,nd in _lr_action.items(): - for name,v in nd.items(): - i = items.get(name) - if not i: - i = ([],[]) - items[name] = i - i[0].append(s) - i[1].append(v) + # Change smaller to 0 to go back to original tables + smaller = 1 - f.write("\n_lr_action_items = {") - for k,v in items.items(): - f.write("%r:([" % k) - for i in v[0]: - f.write("%r," % i) - f.write("],[") - for i in v[1]: - f.write("%r," % i) - - f.write("]),") - f.write("}\n") - - f.write(""" + # Factor out names to try and make smaller + if smaller: + items = { } + + for s,nd in self.lr_action.items(): + for name,v in nd.items(): + i = items.get(name) + if not i: + i = ([],[]) + items[name] = i + i[0].append(s) + i[1].append(v) + + f.write("\n_lr_action_items = {") + for k,v in items.items(): + f.write("%r:([" % k) + for i in v[0]: + f.write("%r," % i) + f.write("],[") + for i in v[1]: + f.write("%r," % i) + + f.write("]),") + f.write("}\n") + + f.write(""" _lr_action = { } for _k, _v in _lr_action_items.items(): for _x,_y in zip(_v[0],_v[1]): @@ -2575,38 +2531,38 @@ def lr_write_tables(modulename=tab_module,outputdir=''): del _lr_action_items """) - else: - f.write("\n_lr_action = { "); - for k,v in _lr_action.items(): - f.write("(%r,%r):%r," % (k[0],k[1],v)) - f.write("}\n"); - - if smaller: - # Factor out names to try and make smaller - items = { } - - for s,nd in _lr_goto.items(): - for name,v in nd.items(): - i = items.get(name) - if not i: - i = ([],[]) - items[name] = i - i[0].append(s) - i[1].append(v) - - f.write("\n_lr_goto_items = {") - for k,v in items.items(): - f.write("%r:([" % k) - for i in v[0]: - f.write("%r," % i) - f.write("],[") - for i in v[1]: - f.write("%r," % i) - - f.write("]),") - f.write("}\n") - - f.write(""" + else: + f.write("\n_lr_action = { "); + for k,v in self.lr_action.items(): + f.write("(%r,%r):%r," % (k[0],k[1],v)) + f.write("}\n"); + + if smaller: + # Factor out names to try and make smaller + items = { } + + for s,nd in self.lr_goto.items(): + for name,v in nd.items(): + i = items.get(name) + if not i: + i = ([],[]) + items[name] = i + i[0].append(s) + i[1].append(v) + + f.write("\n_lr_goto_items = {") + for k,v in items.items(): + f.write("%r:([" % k) + for i in v[0]: + f.write("%r," % i) + f.write("],[") + for i in v[1]: + f.write("%r," % i) + + f.write("]),") + f.write("}\n") + + f.write(""" _lr_goto = { } for _k, _v in _lr_goto_items.items(): for _x,_y in zip(_v[0],_v[1]): @@ -2614,318 +2570,585 @@ def lr_write_tables(modulename=tab_module,outputdir=''): _lr_goto[_x][_k] = _y del _lr_goto_items """) - else: - f.write("\n_lr_goto = { "); - for k,v in _lr_goto.items(): - f.write("(%r,%r):%r," % (k[0],k[1],v)) - f.write("}\n"); - - # Write production table - f.write("_lr_productions = [\n") - for p in Productions: - if p: - if (p.func): - f.write(" (%r,%d,%r,%r,%d),\n" % (p.name, p.len, p.func.__name__,p.file,p.line)) - else: - f.write(" (%r,%d,None,None,None),\n" % (p.name, p.len)) - else: - f.write(" None,\n") - f.write("]\n") - - f.close() - - except IOError: - e = sys.exc_info()[1] - sys.stderr.write("Unable to create '%s'\n" % filename) - sys.stderr.write(str(e)+"\n") - return - -def 
lr_read_tables(module=tab_module,optimize=0): - global _lr_action, _lr_goto, _lr_productions, _lr_method - parsetab = None - try: - if isinstance(module,types.ModuleType): - parsetab = module - else: - if sys.version_info[0] < 3: - exec("import %s as parsetab" % module) else: - env = { } - exec("import %s as parsetab" % module, env, env) - parsetab = env['parsetab'] - - if (optimize) or (Signature.digest() == parsetab._lr_signature): - _lr_action = parsetab._lr_action - _lr_goto = parsetab._lr_goto - _lr_productions = parsetab._lr_productions - _lr_method = parsetab._lr_method - return 1 - else: - return 0 + f.write("\n_lr_goto = { "); + for k,v in self.lr_goto.items(): + f.write("(%r,%r):%r," % (k[0],k[1],v)) + f.write("}\n"); + + # Write production table + f.write("_lr_productions = [\n") + for p in self.lr_productions: + if p.func: + f.write(" (%r,%r,%d,%r,%r,%d),\n" % (p.str,p.name, p.len, p.func,p.file,p.line)) + else: + f.write(" (%r,%r,%d,None,None,None),\n" % (str(p),p.name, p.len)) + f.write("]\n") + f.close() - except (ImportError,AttributeError): - return 0 + except IOError: + e = sys.exc_info()[1] + sys.stderr.write("Unable to create '%s'\n" % filename) + sys.stderr.write(str(e)+"\n") + return # ----------------------------------------------------------------------------- -# yacc(module) +# === INTROSPECTION === # -# Build the parser module +# The following functions and classes are used to implement the PLY +# introspection features followed by the yacc() function itself. # ----------------------------------------------------------------------------- -def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0,write_tables=1,debugfile=debug_file,outputdir=''): - global yaccdebug - yaccdebug = debug - - initialize_vars() - files = { } - error = 0 - - - # Add parsing method to signature - Signature_update(method) - - # If a "module" parameter was supplied, extract its dictionary. - # Note: a module may in fact be an instance as well. +# ----------------------------------------------------------------------------- +# get_caller_module_dict() +# +# This function returns a dictionary containing all of the symbols defined within +# a caller further down the call stack. This is used to get the environment +# associated with the yacc() call if none was provided. +# ----------------------------------------------------------------------------- - if module: - _items = [(k,getattr(module,k)) for k in dir(module)] - ldict = { } - for i in _items: - ldict[i[0]] = i[1] - else: - # No module given. We might be able to get information from the caller. 
- # Throw an exception and unwind the traceback to get the globals +def get_caller_module_dict(levels): + try: + raise RuntimeError + except RuntimeError: + e,b,t = sys.exc_info() + f = t.tb_frame + while levels > 0: + f = f.f_back # Walk out to our calling function + levels -= 1 + ldict = f.f_globals.copy() + if f.f_globals != f.f_locals: + ldict.update(f.f_locals) + + return ldict +# ----------------------------------------------------------------------------- +# parse_grammar() +# +# This takes a raw grammar rule string and parses it into production data +# ----------------------------------------------------------------------------- +def parse_grammar(doc,file,line): + grammar = [] + # Split the doc string into lines + pstrings = doc.splitlines() + lastp = None + dline = line + for ps in pstrings: + dline += 1 + p = ps.split() + if not p: continue try: - raise RuntimeError - except RuntimeError: - e,b,t = sys.exc_info() - f = t.tb_frame - f = f.f_back # Walk out to our calling function - if f.f_globals is f.f_locals: # Collect global and local variations from caller - ldict = f.f_globals + if p[0] == '|': + # This is a continuation of a previous rule + if not lastp: + raise SyntaxError("%s:%d: Misplaced '|'" % (file,dline)) + prodname = lastp + syms = p[1:] else: - ldict = f.f_globals.copy() - ldict.update(f.f_locals) + prodname = p[0] + lastp = prodname + syms = p[2:] + assign = p[1] + if assign != ':' and assign != '::=': + raise SyntaxError("%s:%d: Syntax error. Expected ':'" % (file,dline)) + + grammar.append((file,dline,prodname,syms)) + except SyntaxError: + raise + except Exception: + raise SyntaxError("%s:%d: Syntax error in rule '%s'" % (file,dline,ps.strip())) - # Add starting symbol to signature - if not start: - start = ldict.get("start",None) - if start: - Signature_update(start) + return grammar - # Look for error handler - ef = ldict.get('p_error',None) - if ef: - if isinstance(ef,types.FunctionType): - ismethod = 0 - elif isinstance(ef, types.MethodType): - ismethod = 1 +# ----------------------------------------------------------------------------- +# ParserReflect() +# +# This class represents information extracted for building a parser including +# start symbol, error function, tokens, precedence list, action functions, +# etc. +# ----------------------------------------------------------------------------- +class ParserReflect(object): + def __init__(self,pdict,log=None): + self.pdict = pdict + self.start = None + self.error_func = None + self.tokens = None + self.files = {} + self.grammar = [] + self.error = 0 + + if log is None: + self.log = PlyLogger(sys.stderr) else: - raise YaccError("'p_error' defined, but is not a function or method.") - eline = func_code(ef).co_firstlineno - efile = func_code(ef).co_filename - files[efile] = None - - if (func_code(ef).co_argcount != 1+ismethod): - raise YaccError("%s:%d: p_error() requires 1 argument." % (efile,eline)) - global Errorfunc - Errorfunc = ef - else: - sys.stderr.write("yacc: Warning. no p_error() function is defined.\n") - - # If running in optimized mode. 
We're going to read tables instead - - if (optimize and lr_read_tables(tabmodule,1)): - # Read parse table - del Productions[:] - for p in _lr_productions: - if not p: - Productions.append(None) - else: - m = MiniProduction() - m.name = p[0] - m.len = p[1] - m.file = p[3] - m.line = p[4] - if p[2]: - m.func = ldict[p[2]] - Productions.append(m) - - else: - # Get the tokens map - tokens = ldict.get("tokens",None) - if not tokens: - raise YaccError("module does not define a list 'tokens'") - if not isinstance(tokens,(list, tuple)): - raise YaccError("tokens must be a list or tuple.") + self.log = log + + # Get all of the basic information + def get_all(self): + self.get_start() + self.get_error_func() + self.get_tokens() + self.get_precedence() + self.get_pfunctions() + + # Validate all of the information + def validate_all(self): + self.validate_start() + self.validate_error_func() + self.validate_tokens() + self.validate_precedence() + self.validate_pfunctions() + self.validate_files() + return self.error + + # Compute a signature over the grammar + def signature(self): + from binascii import crc32 + sig = 0 + try: + if self.start: + sig = crc32(self.start.encode('latin-1'),sig) + if self.prec: + sig = crc32("".join(["".join(p) for p in self.prec]).encode('latin-1'),sig) + if self.tokens: + sig = crc32(" ".join(self.tokens).encode('latin-1'),sig) + for f in self.pfuncs: + if f[3]: + sig = crc32(f[3].encode('latin-1'),sig) + except (TypeError,ValueError): + pass + return sig & 0xffffffff - # Check to see if a requires dictionary is defined. - requires = ldict.get("require",None) - if requires: - if not (isinstance(requires,dict)): - raise YaccError("require must be a dictionary.") + # ----------------------------------------------------------------------------- + # validate_file() + # + # This method checks to see if there are duplicated p_rulename() functions + # in the parser module file. Without this function, it is really easy for + # users to make mistakes by cutting and pasting code fragments (and it's a real + # bugger to try and figure out why the resulting parser doesn't work). Therefore, + # we just do a little regular expression pattern matching of def statements + # to try and detect duplicates. + # ----------------------------------------------------------------------------- + + def validate_files(self): + # Match def p_funcname( + fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') + + for filename in self.files.keys(): + base,ext = os.path.splitext(filename) + if ext != '.py': return 1 # No idea. Assume it's okay. - for r,v in requires.items(): - try: - if not isinstance(v,list): - raise TypeError - v1 = [x.split(".") for x in v] - Requires[r] = v1 - except Exception: - sys.stderr.write("Invalid specification for rule '%s' in require. Expected a list of strings\n" % r) + try: + f = open(filename) + lines = f.readlines() + f.close() + except IOError: + continue + counthash = { } + for linen,l in enumerate(lines): + linen += 1 + m = fre.match(l) + if m: + name = m.group(1) + prev = counthash.get(name) + if not prev: + counthash[name] = linen + else: + self.log.warning("%s:%d: Function %s redefined. Previously defined on line %d", filename,linen,name,prev) - # Build the dictionary of terminals. We a record a 0 in the - # dictionary to track whether or not a terminal is actually - # used in the grammar + # Get the start symbol + def get_start(self): + self.start = self.pdict.get('start') - if 'error' in tokens: - sys.stderr.write("yacc: Illegal token 'error'. 
-
-    # Check to see if a requires dictionary is defined.
-    requires = ldict.get("require",None)
-    if requires:
-        if not (isinstance(requires,dict)):
-            raise YaccError("require must be a dictionary.")
+
+    # -----------------------------------------------------------------------------
+    # validate_file()
+    #
+    # This method checks to see if there are duplicated p_rulename() functions
+    # in the parser module file.  Without this function, it is really easy for
+    # users to make mistakes by cutting and pasting code fragments (and it's a real
+    # bugger to try and figure out why the resulting parser doesn't work).  Therefore,
+    # we just do a little regular expression pattern matching of def statements
+    # to try and detect duplicates.
+    # -----------------------------------------------------------------------------
+
+    def validate_files(self):
+        # Match def p_funcname(
+        fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(')
+
+        for filename in self.files.keys():
+            base,ext = os.path.splitext(filename)
+            if ext != '.py': return 1          # No idea. Assume it's okay.
-        for r,v in requires.items():
-            try:
-                if not isinstance(v,list):
-                    raise TypeError
-                v1 = [x.split(".") for x in v]
-                Requires[r] = v1
-            except Exception:
-                sys.stderr.write("Invalid specification for rule '%s' in require. Expected a list of strings\n" % r)
+
+            try:
+                f = open(filename)
+                lines = f.readlines()
+                f.close()
+            except IOError:
+                continue
+
+            counthash = { }
+            for linen,l in enumerate(lines):
+                linen += 1
+                m = fre.match(l)
+                if m:
+                    name = m.group(1)
+                    prev = counthash.get(name)
+                    if not prev:
+                        counthash[name] = linen
+                    else:
+                        self.log.warning("%s:%d: Function %s redefined. Previously defined on line %d", filename,linen,name,prev)
-
-    # Build the dictionary of terminals.  We record a 0 in the
-    # dictionary to track whether or not a terminal is actually
-    # used in the grammar
+
+    # Get the start symbol
+    def get_start(self):
+        self.start = self.pdict.get('start')
-
-    if 'error' in tokens:
-        sys.stderr.write("yacc: Illegal token 'error'.  Is a reserved word.\n")
-        raise YaccError("Illegal token name")
+
+    # Validate the start symbol
+    def validate_start(self):
+        if self.start is not None:
+            if not isinstance(self.start,str):
+                self.log.error("'start' must be a string")
-
-    for n in tokens:
-        if n in Terminals:
-            sys.stderr.write("yacc: Warning. Token '%s' multiply defined.\n" % n)
-        Terminals[n] = [ ]
+
+    # Look for error handler
+    def get_error_func(self):
+        self.error_func = self.pdict.get('p_error')
+
+    # Validate the error function
+    def validate_error_func(self):
+        if self.error_func:
+            if isinstance(self.error_func,types.FunctionType):
+                ismethod = 0
+            elif isinstance(self.error_func, types.MethodType):
+                ismethod = 1
+            else:
+                self.log.error("'p_error' defined, but is not a function or method")
+                self.error = 1
+                return
-
-    Terminals['error'] = [ ]
+
+            eline = func_code(self.error_func).co_firstlineno
+            efile = func_code(self.error_func).co_filename
+            self.files[efile] = 1
-
-    # Get the precedence map (if any)
-    prec = ldict.get("precedence",None)
-    if prec:
-        if not isinstance(prec,(list,tuple)):
-            raise YaccError("precedence must be a list or tuple.")
-        add_precedence(prec)
-        Signature_update(repr(prec))
+
+            if (func_code(self.error_func).co_argcount != 1+ismethod):
+                self.log.error("%s:%d: p_error() requires 1 argument",efile,eline)
+                self.error = 1
-
-    for n in tokens:
-        if not n in Precedence:
-            Precedence[n] = ('right',0)        # Default, right associative, 0 precedence
+
+    # Get the tokens map
+    def get_tokens(self):
+        tokens = self.pdict.get("tokens",None)
+        if not tokens:
+            self.log.error("No token list is defined")
+            self.error = 1
+            return
-
-    # Get the list of built-in functions with p_ prefix
-    symbols = [ldict[f] for f in ldict
-               if (type(ldict[f]) in (types.FunctionType, types.MethodType) and ldict[f].__name__[:2] == 'p_'
-                   and ldict[f].__name__ != 'p_error')]
+
+        if not isinstance(tokens,(list, tuple)):
+            self.log.error("tokens must be a list or tuple")
+            self.error = 1
+            return
+
+        if not tokens:
+            self.log.error("tokens is empty")
+            self.error = 1
+            return
+
+        self.tokens = tokens
+
+    # Validate the tokens
+    def validate_tokens(self):
+        # Validate the tokens.
+        if 'error' in self.tokens:
+            self.log.error("Illegal token name 'error'. Is a reserved word")
+            self.error = 1
+            return
+
+        terminals = {}
+        for n in self.tokens:
+            if n in terminals:
+                self.log.warning("Token '%s' multiply defined", n)
+            terminals[n] = 1
+
+    # Get the precedence map (if any)
+    def get_precedence(self):
+        self.prec = self.pdict.get("precedence",None)
+
+    # Validate and parse the precedence map
+    def validate_precedence(self):
+        preclist = []
+        if self.prec:
+            if not isinstance(self.prec,(list,tuple)):
+                self.log.error("precedence must be a list or tuple")
+                self.error = 1
+                return
+            for level,p in enumerate(self.prec):
+                if not isinstance(p,(list,tuple)):
+                    self.log.error("Bad precedence table")
+                    self.error = 1
+                    return
+                if len(p) < 2:
+                    self.log.error("Malformed precedence entry %s. Must be (assoc, term, ..., term)",p)
+                    self.error = 1
+                    return
+                assoc = p[0]
+                if not isinstance(assoc,str):
+                    self.log.error("precedence associativity must be a string")
+                    self.error = 1
+                    return
+                for term in p[1:]:
+                    if not isinstance(term,str):
+                        self.log.error("precedence items must be strings")
+                        self.error = 1
+                        return
+                    preclist.append((term,assoc,level+1))
+        self.preclist = preclist
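To make the flattening above concrete: validate_precedence() turns the user-facing table into the (term, assoc, level) triples that Grammar.set_precedence() takes, numbering levels from 1. An illustrative declaration (not from the patch) and its result:

    precedence = (
        ('left',  'PLUS',  'MINUS'),     # level 1, binds loosest
        ('left',  'TIMES', 'DIVIDE'),    # level 2
        ('right', 'UMINUS'),             # level 3, binds tightest
    )
    # self.preclist becomes:
    #   [('PLUS','left',1), ('MINUS','left',1),
    #    ('TIMES','left',2), ('DIVIDE','left',2),
    #    ('UMINUS','right',3)]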
+
+    # Get all p_functions from the grammar
+    def get_pfunctions(self):
+        p_functions = []
+        for name, item in self.pdict.items():
+            if name[:2] != 'p_': continue
+            if name == 'p_error': continue
+            if isinstance(item,(types.FunctionType,types.MethodType)):
+                line = func_code(item).co_firstlineno
+                file = func_code(item).co_filename
+                p_functions.append((line,file,name,item.__doc__))
+
+        # Sort all of the actions by line number
+        p_functions.sort()
+        self.pfuncs = p_functions
+
+    # Validate all of the p_functions
+    def validate_pfunctions(self):
+        grammar = []
         # Check for non-empty symbols
-    if len(symbols) == 0:
-        raise YaccError("no rules of the form p_rulename are defined.")
-
-    # Sort the symbols by line number
-    if sys.version_info[0] < 3:
-        symbols.sort(lambda x,y: cmp(func_code(x).co_firstlineno,func_code(y).co_firstlineno))
-    else:
-        # Python 3
-        symbols.sort(key=lambda x: func_code(x).co_firstlineno)
-
-    # Add all of the symbols to the grammar
-    for f in symbols:
-        if (add_function(f)) < 0:
-            error += 1
+        if len(self.pfuncs) == 0:
+            self.log.error("no rules of the form p_rulename are defined")
+            self.error = 1
+            return
+
+        for line, file, name, doc in self.pfuncs:
+            func = self.pdict[name]
+            if isinstance(func, types.MethodType):
+                reqargs = 2
             else:
-            files[func_code(f).co_filename] = None
-
-    # Make a signature of the docstrings
-    for f in symbols:
-        if f.__doc__:
-            Signature_update(f.__doc__)
-
-    lr_init_vars()
-
-    if error:
-        raise YaccError("Unable to construct parser.")
-
-    if not lr_read_tables(tabmodule):
-
-        # Validate files
-        for filename in files:
-            if not validate_file(filename):
-                error = 1
-
-        # Validate dictionary
-        validate_dict(ldict)
-
-        if start and not start in Prodnames:
-            raise YaccError("Bad starting symbol '%s'" % start)
+                reqargs = 1
+            if func_code(func).co_argcount > reqargs:
+                self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,func.__name__)
+                self.error = 1
+            elif func_code(func).co_argcount < reqargs:
+                self.log.error("%s:%d: Rule '%s' requires an argument",file,line,func.__name__)
+                self.error = 1
+            elif not func.__doc__:
+                self.log.warning("%s:%d: No documentation string specified in function '%s' (ignored)",file,line,func.__name__)
+            else:
+                try:
+                    parsed_g = parse_grammar(doc,file,line)
+                    for g in parsed_g:
+                        grammar.append((name, g))
+                except SyntaxError:
+                    e = sys.exc_info()[1]
+                    self.log.error(str(e))
+                    self.error = 1
+
+                # Looks like a valid grammar rule
+                # Mark the file in which defined.
+                self.files[file] = 1
+
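A note on the argument check just above: plain rule functions must take exactly one argument, and bound methods exactly two counting self. Both of these pass it (a sketch, not part of the patch):

    def p_statement_expr(p):               # module-level rule: 1 argument
        'statement : expression'
        p[0] = p[1]

    class CalcParser:
        def p_statement_expr(self, p):     # method rule: self plus 1
            'statement : expression'
            p[0] = p[1]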
+        # Secondary validation step that looks for p_ definitions that are not functions
+        # or functions that look like they might be grammar rules.
+
+        for n,v in self.pdict.items():
+            if n[0:2] == 'p_' and isinstance(v, (types.FunctionType, types.MethodType)): continue
+            if n[0:2] == 't_': continue
+            if n[0:2] == 'p_' and n != 'p_error':
+                self.log.warning("'%s' not defined as a function", n)
+            if ((isinstance(v,types.FunctionType) and func_code(v).co_argcount == 1) or
+               (isinstance(v,types.MethodType) and func_code(v).co_argcount == 2)):
+                try:
+                    doc = v.__doc__.split(" ")
+                    if doc[1] == ':':
+                        self.log.warning("%s:%d: Possible grammar rule '%s' defined without p_ prefix",
+                                         func_code(v).co_filename, func_code(v).co_firstlineno,n)
+                except Exception:
+                    pass
-
-    augment_grammar(start)
-    error = verify_productions(cycle_check=check_recursion)
-    otherfunc = [ldict[f] for f in ldict
-                 if (type(f) in (types.FunctionType,types.MethodType) and ldict[f].__name__[:2] != 'p_')]
+
+        self.grammar = grammar
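That completes the class: it reads one module dictionary, harvests the parser pieces, and validates them, and the new yacc() below drives it in exactly that order. A condensed usage sketch (the module name is hypothetical; the other names are defined by this patch):

    from ply.yacc import ParserReflect, YaccError

    import calcmodule                  # hypothetical module holding tokens/p_ rules
    pdict = dict((k, getattr(calcmodule, k)) for k in dir(calcmodule))

    pinfo = ParserReflect(pdict)       # no log given, so PlyLogger(sys.stderr) is used
    pinfo.get_all()                    # harvest start, p_error, tokens, precedence, rules
    if pinfo.validate_all():           # nonzero means the collected info is unusable
        raise YaccError("Unable to build parser")
    print("grammar signature: %#010x" % pinfo.signature())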
-
-    # Check precedence rules
-    if check_precedence():
-        error = 1
+
+# -----------------------------------------------------------------------------
+# yacc(module)
+#
+# Build a parser
+# -----------------------------------------------------------------------------
-
-    if error:
-        raise YaccError("Unable to construct parser.")
+
+def yacc(method='LALR', debug=yaccdebug, module=None, tabmodule=tab_module, start=None,
+         check_recursion=1, optimize=0, write_tables=1, debugfile=debug_file, outputdir='',
+         debuglog=None, errorlog=None):
-
-    build_lritems()
-    compute_first1()
-    compute_follow(start)
+
+    global parse                       # Reference to the parsing method of the last built parser
-
-    if method in ['SLR','LALR']:
-        lr_parse_table(method)
-    else:
-        raise YaccError("Unknown parsing method '%s'" % method)
+
+    if errorlog is None:
+        errorlog = PlyLogger(sys.stderr)
-
-    if write_tables:
-        lr_write_tables(tabmodule,outputdir)
+
+    # Get the module dictionary used for the parser
+    if module:
+        _items = [(k,getattr(module,k)) for k in dir(module)]
+        pdict = dict(_items)
+    else:
+        pdict = get_caller_module_dict(2)
-
-    if yaccdebug:
-        try:
-            f = open(os.path.join(outputdir,debugfile),"w")
-            f.write(_vfc.getvalue())
-            f.write("\n\n")
-            f.write(_vf.getvalue())
-            f.close()
-        except IOError:
-            e = sys.exc_info()[1]
-            sys.stderr.write("yacc: can't create '%s' %s\n" % (debugfile,e))
+
+    # Collect parser information from the dictionary
+    pinfo = ParserReflect(pdict,log=errorlog)
+    pinfo.get_all()
-
-    # Made it here.  Create a parser object and set up its internal state.
-    # Set global parse() method to bound method of parser object.
+
+    if pinfo.error:
+        raise YaccError("Unable to build parser")
-
-    p = Parser("xyzzy")
-    p.productions = Productions
-    p.errorfunc = Errorfunc
-    p.action = _lr_action
-    p.goto   = _lr_goto
-    p.method = _lr_method
-    p.require = Requires
+
+    # Check signature against table files (if any)
+    signature = pinfo.signature()
-
-    global parse
-    parse = p.parse
+
+    # Read the tables
+    try:
+        lr = LRTable()
+        read_signature = lr.read_table(tabmodule)
+        if optimize or (read_signature == signature):
+            try:
+                lr.bind_callables(pinfo.pdict)
+                parser = LRParser(lr,pinfo.error_func)
+                parse = parser.parse
+                return parser
+            except Exception:
+                e = sys.exc_info()[1]
+                errorlog.warning("There was a problem loading the table file: %s", repr(e))
+    except VersionError:
+        e = sys.exc_info()[1]
+        errorlog.warning(str(e))
+    except Exception:
+        pass
+
+    if debuglog is None:
+        if debug:
+            debuglog = PlyLogger(open(debugfile,"w"))
+        else:
+            debuglog = NullLogger()
-
-    global parser
-    parser = p
+
+    debuglog.info("Created by PLY version %s (http://www.dabeaz.com/ply)", __version__)
-
-    # Clean up all of the globals we created
-    if (not optimize):
-        yacc_cleanup()
-    return p
-
-# yacc_cleanup function.  Delete all of the global variables
-# used during table construction
+
+    errors = 0
-
-def yacc_cleanup():
-    global _lr_action, _lr_goto, _lr_method, _lr_goto_cache
-    del _lr_action, _lr_goto, _lr_method, _lr_goto_cache
+
+    # Validate the parser information
+    if pinfo.validate_all():
+        raise YaccError("Unable to build parser")
+
+    if not pinfo.error_func:
+        errorlog.warning("no p_error() function is defined")
-
-    global Productions, Prodnames, Prodmap, Terminals
-    global Nonterminals, First, Follow, Precedence, UsedPrecedence, LRitems
-    global Errorfunc, Signature, Requires
+
+    # Create a grammar object
+    grammar = Grammar(pinfo.tokens)
-
-    del Productions, Prodnames, Prodmap, Terminals
-    del Nonterminals, First, Follow, Precedence, UsedPrecedence, LRitems
-    del Errorfunc, Signature, Requires
+
+    # Set precedence level for terminals
+    for term, assoc, level in pinfo.preclist:
+        try:
+            grammar.set_precedence(term,assoc,level)
+        except GrammarError:
+            e = sys.exc_info()[1]
+            errorlog.warning("%s",str(e))
+
+    # Add productions to the grammar
+    for funcname, gram in pinfo.grammar:
+        file, line, prodname, syms = gram
+        try:
+            grammar.add_production(prodname,syms,funcname,file,line)
+        except GrammarError:
+            e = sys.exc_info()[1]
+            errorlog.error("%s",str(e))
+            errors = 1
-
-    global _vf, _vfc
-    del _vf, _vfc
+
+    # Set the grammar start symbols
+    try:
+        grammar.set_start(pinfo.start)
+    except GrammarError:
+        e = sys.exc_info()[1]
+        errorlog.error(str(e))
+        errors = 1
+
+    if errors:
+        raise YaccError("Unable to build parser")
+
+    # Verify the grammar structure
+    undefined_symbols = grammar.undefined_symbols()
+    for sym, prod in undefined_symbols:
+        errorlog.error("%s:%d: Symbol '%s' used, but not defined as a token or a rule",prod.file,prod.line,sym)
+        errors = 1
+
+    unused_terminals = grammar.unused_terminals()
+    if unused_terminals:
+        debuglog.info("")
+        debuglog.info("Unused terminals:")
+        debuglog.info("")
+        for term in unused_terminals:
+            errorlog.warning("Token '%s' defined, but not used", term)
+            debuglog.info("    %s", term)
+
+    # Print out all productions to the debug log
+    if debug:
+        debuglog.info("")
+        debuglog.info("Grammar")
+        debuglog.info("")
+        for n,p in enumerate(grammar.Productions):
+            debuglog.info("Rule %-5d %s", n+1, p)
+
+    # Find unused non-terminals
+    unused_rules = grammar.unused_rules()
+    for prod in unused_rules:
+        errorlog.warning("%s:%d: Rule '%s' defined, but not used", prod.file, prod.line, prod.name)
+
+    if len(unused_terminals) == 1:
+        errorlog.warning("There is 1 unused token")
+    if len(unused_terminals) > 1:
+        errorlog.warning("There are %d unused tokens", len(unused_terminals))
+
+    if len(unused_rules) == 1:
+        errorlog.warning("There is 1 unused rule")
+    if len(unused_rules) > 1:
+        errorlog.warning("There are %d unused rules", len(unused_rules))
+
+    if debug:
+        debuglog.info("")
+        debuglog.info("Terminals, with rules where they appear")
+        debuglog.info("")
+        terms = list(grammar.Terminals)
+        terms.sort()
+        for term in terms:
+            debuglog.info("%-20s : %s", term, " ".join([str(s) for s in grammar.Terminals[term]]))
+
+        debuglog.info("")
+        debuglog.info("Nonterminals, with rules where they appear")
+        debuglog.info("")
+        nonterms = list(grammar.Nonterminals)
+        nonterms.sort()
+        for nonterm in nonterms:
+            debuglog.info("%-20s : %s", nonterm, " ".join([str(s) for s in grammar.Nonterminals[nonterm]]))
+        debuglog.info("")
+
+    if check_recursion:
+        unreachable = grammar.find_unreachable()
+        for u in unreachable:
+            errorlog.warning("Symbol '%s' is unreachable",u)
+
+        infinite = grammar.infinite_cycles()
+        for inf in infinite:
+            errorlog.error("Infinite recursion detected for symbol '%s'", inf)
+            errors = 1
+
+    unused_prec = grammar.unused_precedence()
+    for term, assoc in unused_prec:
+        errorlog.error("Precedence rule '%s' defined for unknown symbol '%s'", assoc, term)
+        errors = 1
+
+    if errors:
+        raise YaccError("Unable to build parser")
+
+    # Run the LRGeneratedTable on the grammar
+    errorlog.debug("Generating %s tables", method)
+
+    lr = LRGeneratedTable(grammar,method,debuglog)
+
+    num_sr = len(lr.sr_conflicts)
+
+    # Report shift/reduce and reduce/reduce conflicts
+    if num_sr == 1:
+        errorlog.warning("1 shift/reduce conflict")
+    elif num_sr > 1:
+        errorlog.warning("%d shift/reduce conflicts", num_sr)
+
+    num_rr = len(lr.rr_conflicts)
+    if num_rr == 1:
+        errorlog.warning("1 reduce/reduce conflict")
+    elif num_rr > 1:
+        errorlog.warning("%d reduce/reduce conflicts", num_rr)
+
+    # Write out conflicts to the output file
+    if debug and (lr.sr_conflicts or lr.rr_conflicts):
+        debuglog.warning("")
+        debuglog.warning("Conflicts:")
+        debuglog.warning("")
+
+        for state, tok, resolution in lr.sr_conflicts:
+            debuglog.warning("shift/reduce conflict for %s in state %d resolved as %s", tok, state, resolution)
+
+        for state, rule, rejected in lr.rr_conflicts:
+            debuglog.warning("reduce/reduce conflict in state %d resolved using rule (%s)", state, rule)
+            debuglog.warning("rejected rule (%s)", rejected)
+            errorlog.warning("reduce/reduce conflict in state %d resolved using rule (%s)", state, rule)
+            errorlog.warning("rejected rule (%s)", rejected)
+
+    # Write the table file if requested
+    if write_tables:
+        lr.write_table(tabmodule,outputdir,signature)
+
+    # Build the parser
+    lr.bind_callables(pinfo.pdict)
+    parser = LRParser(lr,pinfo.error_func)
-
-# Stub that raises an error if parsing is attempted without first calling yacc()
-def parse(*args,**kwargs):
-    raise YaccError("yacc: No parser built with yacc()")
+
+    parse = parser.parse
+    return parser
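From the caller's side, the rebuilt yacc() keeps the familiar one-call interface; what is new is that errorlog and debuglog accept any object offering debug/info/warning/error methods, so a standard logging logger drops in. A usage sketch (file names illustrative; it assumes a lexer for the grammar's tokens has been built, e.g. with ply.lex):

    import logging
    import ply.yacc as yacc

    logging.basicConfig(level=logging.DEBUG, filename='parser.log')
    log = logging.getLogger(__name__)

    # errorlog gets diagnostics; debuglog gets the parser.out-style dump.
    parser = yacc.yacc(errorlog=log, debuglog=log, debug=1)
    result = parser.parse('2*(3+4)')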
diff --git a/test/testyacc.py b/test/testyacc.py
index 64d41e3..e78b097 100644
--- a/test/testyacc.py
+++ b/test/testyacc.py
@@ -15,7 +15,14 @@ import ply.yacc
 
 def check_expected(result,expected):
-    resultlines = result.splitlines()
+    resultlines = []
+    for line in result.splitlines():
+        if line.startswith("WARNING: "):
+            line = line[9:]
+        elif line.startswith("ERROR: "):
+            line = line[7:]
+        resultlines.append(line)
+
     expectedlines = expected.splitlines()
     if len(resultlines) != len(expectedlines): return False
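The 9- and 7-character strips above correspond to the "WARNING: " and "ERROR: " prefixes written by the PlyLogger fallback referenced throughout this patch; removing them lets the expected strings below stay prefix-free. A sketch of the behavior being normalized (the prefix format is inferred from the strip widths, so treat it as an assumption):

    import sys
    import ply.yacc as yacc

    log = yacc.PlyLogger(sys.stderr)
    log.warning("Token '%s' defined, but not used", 'EQUALS')
    # stderr: WARNING: Token 'EQUALS' defined, but not used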
@@ -47,8 +54,8 @@ def test_yacc_badargs(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badargs")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc_badargs.py:23: Rule 'p_statement_assign' has too many arguments.\n"
-                    "yacc_badargs.py:27: Rule 'p_statement_expr' requires an argument.\n"
+                    "yacc_badargs.py:23: Rule 'p_statement_assign' has too many arguments\n"
+                    "yacc_badargs.py:27: Rule 'p_statement_expr' requires an argument\n"
                     ))
     def test_yacc_badid(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badid")
@@ -62,24 +69,24 @@ def test_yacc_badprec(self):
         try:
             run_import("yacc_badprec")
         except ply.yacc.YaccError:
-            e = sys.exc_info()[1]
-            self.assert_(check_expected(str(e),
-                    "precedence must be a list or tuple."))
+            result = sys.stderr.getvalue()
+            self.assert_(check_expected(result,
+                    "precedence must be a list or tuple\n"
+                    ))
     def test_yacc_badprec2(self):
-        run_import("yacc_badprec2")
+        self.assertRaises(ply.yacc.YaccError,run_import,"yacc_badprec2")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc: Invalid precedence table.\n"
-                    "yacc: Generating LALR parsing table...\n"
-                    "yacc: 8 shift/reduce conflicts\n"
+                    "Bad precedence table\n"
                     ))
 
     def test_yacc_badprec3(self):
         run_import("yacc_badprec3")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc: Precedence already specified for terminal 'MINUS'\n"
-                    "yacc: Generating LALR parsing table...\n"
+                    "Precedence already specified for terminal 'MINUS'\n"
+                    "Generating LALR tables\n"
+                    ))
 
     def test_yacc_badrule(self):
@@ -96,58 +103,60 @@ def test_yacc_badtok(self):
         try:
             run_import("yacc_badtok")
         except ply.yacc.YaccError:
-            e = sys.exc_info()[1]
-            self.assert_(check_expected(str(e),
-                    "tokens must be a list or tuple."))
+            result = sys.stderr.getvalue()
+            self.assert_(check_expected(result,
+                    "tokens must be a list or tuple\n"))
 
     def test_yacc_dup(self):
         run_import("yacc_dup")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
                     "yacc_dup.py:27: Function p_statement redefined. Previously defined on line 23\n"
-                    "yacc: Warning. Token 'EQUALS' defined, but not used.\n"
-                    "yacc: Warning. There is 1 unused token.\n"
-                    "yacc: Generating LALR parsing table...\n"
+                    "Token 'EQUALS' defined, but not used\n"
+                    "There is 1 unused token\n"
+                    "Generating LALR tables\n"
+
                     ))
 
     def test_yacc_error1(self):
         try:
             run_import("yacc_error1")
         except ply.yacc.YaccError:
-            e = sys.exc_info()[1]
-            self.assert_(check_expected(str(e),
-                    "yacc_error1.py:61: p_error() requires 1 argument."))
+            result = sys.stderr.getvalue()
+            self.assert_(check_expected(result,
+                    "yacc_error1.py:61: p_error() requires 1 argument\n"))
 
     def test_yacc_error2(self):
         try:
             run_import("yacc_error2")
         except ply.yacc.YaccError:
-            e = sys.exc_info()[1]
-            self.assert_(check_expected(str(e),
-                    "yacc_error2.py:61: p_error() requires 1 argument."))
+            result = sys.stderr.getvalue()
+            self.assert_(check_expected(result,
+                    "yacc_error2.py:61: p_error() requires 1 argument\n"))
 
     def test_yacc_error3(self):
         try:
             run_import("yacc_error3")
         except ply.yacc.YaccError:
             e = sys.exc_info()[1]
-            self.assert_(check_expected(str(e),
-                    "'p_error' defined, but is not a function or method."))
+            result = sys.stderr.getvalue()
+            self.assert_(check_expected(result,
+                    "'p_error' defined, but is not a function or method\n"))
 
     def test_yacc_error4(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_error4")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc_error4.py:62: Illegal rule name 'error'. Already defined as a token.\n"
+                    "yacc_error4.py:62: Illegal rule name 'error'. Already defined as a token\n"
                     ))
 
     def test_yacc_inf(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_inf")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc: Warning. Token 'NUMBER' defined, but not used.\n"
-                    "yacc: Warning. There is 1 unused token.\n"
-                    "yacc: Infinite recursion detected for symbol 'statement'.\n"
-                    "yacc: Infinite recursion detected for symbol 'expression'.\n"
+                    "Token 'NUMBER' defined, but not used\n"
+                    "There is 1 unused token\n"
+                    "Infinite recursion detected for symbol 'statement'\n"
+                    "Infinite recursion detected for symbol 'expression'\n"
                     ))
     def test_yacc_literal(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_literal")
@@ -159,14 +168,14 @@ def test_yacc_misplaced(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_misplaced")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc_misplaced.py:32: Misplaced '|'.\n"
+                    "yacc_misplaced.py:32: Misplaced '|'\n"
                     ))
 
     def test_yacc_missing1(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_missing1")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc_missing1.py:24: Symbol 'location' used, but not defined as a token or a rule.\n"
+                    "yacc_missing1.py:24: Symbol 'location' used, but not defined as a token or a rule\n"
                     ))
 
     def test_yacc_nested(self):
@@ -182,92 +191,96 @@ def test_yacc_nodoc(self):
         run_import("yacc_nodoc")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc_nodoc.py:27: No documentation string specified in function 'p_statement_expr'\n"
-                    "yacc: Generating LALR parsing table...\n"
+                    "yacc_nodoc.py:27: No documentation string specified in function 'p_statement_expr' (ignored)\n"
+                    "Generating LALR tables\n"
                     ))
 
     def test_yacc_noerror(self):
         run_import("yacc_noerror")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc: Warning. no p_error() function is defined.\n"
-                    "yacc: Generating LALR parsing table...\n"
+                    "no p_error() function is defined\n"
+                    "Generating LALR tables\n"
                     ))
 
     def test_yacc_nop(self):
         run_import("yacc_nop")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc_nop.py:27: Warning. Possible grammar rule 'statement_expr' defined without p_ prefix.\n"
-                    "yacc: Generating LALR parsing table...\n"
+                    "yacc_nop.py:27: Possible grammar rule 'statement_expr' defined without p_ prefix\n"
+                    "Generating LALR tables\n"
                     ))
 
     def test_yacc_notfunc(self):
         run_import("yacc_notfunc")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc: Warning. 'p_statement_assign' not defined as a function\n"
-                    "yacc: Warning. Token 'EQUALS' defined, but not used.\n"
-                    "yacc: Warning. There is 1 unused token.\n"
-                    "yacc: Generating LALR parsing table...\n"
+                    "'p_statement_assign' not defined as a function\n"
+                    "Token 'EQUALS' defined, but not used\n"
+                    "There is 1 unused token\n"
+                    "Generating LALR tables\n"
                     ))
 
     def test_yacc_notok(self):
         try:
             run_import("yacc_notok")
         except ply.yacc.YaccError:
-            e = sys.exc_info()[1]
-            self.assert_(check_expected(str(e),
-                    "module does not define a list 'tokens'"))
+            result = sys.stderr.getvalue()
+            self.assert_(check_expected(result,
+                    "No token list is defined\n"))
 
     def test_yacc_rr(self):
         run_import("yacc_rr")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc: Generating LALR parsing table...\n"
-                    "yacc: 1 reduce/reduce conflict\n"
+                    "Generating LALR tables\n"
+                    "1 reduce/reduce conflict\n"
+                    "reduce/reduce conflict in state 15 resolved using rule (statement -> NAME EQUALS NUMBER)\n"
+                    "rejected rule (expression -> NUMBER)\n"
+                    ))
 
     def test_yacc_simple(self):
         run_import("yacc_simple")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc: Generating LALR parsing table...\n"
+                    "Generating LALR tables\n"
                     ))
     def test_yacc_sr(self):
         run_import("yacc_sr")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc: Generating LALR parsing table...\n"
-                    "yacc: 20 shift/reduce conflicts\n"
+                    "Generating LALR tables\n"
+                    "20 shift/reduce conflicts\n"
                     ))
 
     def test_yacc_term1(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_term1")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc_term1.py:24: Illegal rule name 'NUMBER'. Already defined as a token.\n"
+                    "yacc_term1.py:24: Illegal rule name 'NUMBER'. Already defined as a token\n"
                     ))
 
     def test_yacc_unused(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_unused")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc_unused.py:62: Symbol 'COMMA' used, but not defined as a token or a rule.\n"
-                    "yacc: Symbol 'COMMA' is unreachable.\n"
-                    "yacc: Symbol 'exprlist' is unreachable.\n"
+                    "yacc_unused.py:62: Symbol 'COMMA' used, but not defined as a token or a rule\n"
+                    "Symbol 'COMMA' is unreachable\n"
+                    "Symbol 'exprlist' is unreachable\n"
                     ))
 
     def test_yacc_unused_rule(self):
         run_import("yacc_unused_rule")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc_unused_rule.py:62: Warning. Rule 'integer' defined, but not used.\n"
-                    "yacc: Warning. There is 1 unused rule.\n"
-                    "yacc: Symbol 'integer' is unreachable.\n"
-                    "yacc: Generating LALR parsing table...\n"
+                    "yacc_unused_rule.py:62: Rule 'integer' defined, but not used\n"
+                    "There is 1 unused rule\n"
+                    "Symbol 'integer' is unreachable\n"
+                    "Generating LALR tables\n"
                     ))
 
     def test_yacc_uprec(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_uprec")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
                     "yacc_uprec.py:37: Nothing known about the precedence of 'UMINUS'\n"
                     ))
@@ -276,17 +289,17 @@ def test_yacc_uprec2(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_uprec2")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc_uprec2.py:37: Syntax error. Nothing follows %prec.\n"
+                    "yacc_uprec2.py:37: Syntax error. Nothing follows %prec\n"
                     ))
 
     def test_yacc_prec1(self):
         self.assertRaises(ply.yacc.YaccError,run_import,"yacc_prec1")
         result = sys.stderr.getvalue()
         self.assert_(check_expected(result,
-                    "yacc: Precedence rule 'left' defined for unknown symbol '+'\n"
-                    "yacc: Precedence rule 'left' defined for unknown symbol '*'\n"
-                    "yacc: Precedence rule 'left' defined for unknown symbol '-'\n"
-                    "yacc: Precedence rule 'left' defined for unknown symbol '/'\n"
+                    "Precedence rule 'left' defined for unknown symbol '+'\n"
+                    "Precedence rule 'left' defined for unknown symbol '*'\n"
+                    "Precedence rule 'left' defined for unknown symbol '-'\n"
+                    "Precedence rule 'left' defined for unknown symbol '/'\n"
                     ))
diff --git a/test/yacc_badid.py b/test/yacc_badid.py
index 1df351c..e4b9f5e 100644
--- a/test/yacc_badid.py
+++ b/test/yacc_badid.py
@@ -28,7 +28,7 @@ def p_statement_expr(t):
     'statement : expression'
     print(t[1])
 
-def p_statement_expr(t):
+def p_statement_expr2(t):
     'statement : bad&rule'
     pass
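A closing observation on the yacc_badid.py fix: the renamed function had accidentally duplicated p_statement_expr, which is exactly the cut-and-paste mistake ParserReflect.validate_files() now reports, since Python silently keeps only the second definition and the first rule vanishes. A minimal sketch of the pitfall (file and rules hypothetical):

    def p_statement(p):
        'statement : NAME EQUALS expression'
        p[0] = ('assign', p[1], p[3])

    def p_statement(p):                # silently replaces the rule above
        'statement : expression'
        p[0] = ('expr', p[1])

    # validate_files() would log something like:
    #   example.py:5: Function p_statement redefined. Previously defined on line 1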