Skip to content

Commit

Permalink
lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
Browse files Browse the repository at this point in the history
between runs given the same Grammar.txt input regardless of the hash
randomization setting.
  • Loading branch information
gpshead committed Sep 8, 2016
1 parent d61910c commit dd1c638
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 18 deletions.
15 changes: 9 additions & 6 deletions Lib/lib2to3/pgen2/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,16 +106,19 @@ def parse_string(self, text, debug=False):
return self.parse_tokens(tokens, debug)


def _generate_pickle_name(gt):
head, tail = os.path.splitext(gt)
if tail == ".txt":
tail = ""
return head + tail + ".".join(map(str, sys.version_info)) + ".pickle"


def load_grammar(gt="Grammar.txt", gp=None,
save=True, force=False, logger=None):
"""Load the grammar (maybe from a pickle)."""
if logger is None:
logger = logging.getLogger()
if gp is None:
head, tail = os.path.splitext(gt)
if tail == ".txt":
tail = ""
gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
gp = _generate_pickle_name(gt) if gp is None else gp
if force or not _newer(gp, gt):
logger.info("Generating grammar tables from %s", gt)
g = pgen.generate_grammar(gt)
Expand All @@ -124,7 +127,7 @@ def load_grammar(gt="Grammar.txt", gp=None,
try:
g.dump(gp)
except OSError as e:
logger.info("Writing failed:"+str(e))
logger.info("Writing failed: %s", e)
else:
g = grammar.Grammar()
g.load(gp)
Expand Down
28 changes: 26 additions & 2 deletions Lib/lib2to3/pgen2/grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"""

# Python imports
import collections
import pickle

# Local imports
Expand Down Expand Up @@ -85,9 +86,21 @@ def __init__(self):
self.start = 256

def dump(self, filename):
"""Dump the grammar tables to a pickle file."""
"""Dump the grammar tables to a pickle file.
dump() recursively changes all dict to OrderedDict, so the pickled file
is not exactly the same as what was passed in to dump(). load() uses the
pickled file to create the tables, but only changes OrderedDict to dict
at the top level; it does not recursively change OrderedDict to dict.
So, the loaded tables are different from the original tables that were
passed to load() in that some of the OrderedDict (from the pickled file)
are not changed back to dict. For parsing, this has no effect on
performance because OrderedDict uses dict's __getitem__ with nothing in
between.
"""
with open(filename, "wb") as f:
pickle.dump(self.__dict__, f, 2)
d = _make_deterministic(self.__dict__)
pickle.dump(d, f, 2)

def load(self, filename):
"""Load the grammar tables from a pickle file."""
Expand Down Expand Up @@ -124,6 +137,17 @@ def report(self):
print("start", self.start)


def _make_deterministic(top):
if isinstance(top, dict):
return collections.OrderedDict(
sorted(((k, _make_deterministic(v)) for k, v in top.items())))
if isinstance(top, list):
return [_make_deterministic(e) for e in top]
if isinstance(top, tuple):
return tuple(_make_deterministic(e) for e in top)
return top


# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
Expand Down
8 changes: 4 additions & 4 deletions Lib/lib2to3/pgen2/pgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def make_grammar(self):
states = []
for state in dfa:
arcs = []
for label, next in state.arcs.items():
for label, next in sorted(state.arcs.items()):
arcs.append((self.make_label(c, label), dfa.index(next)))
if state.isfinal:
arcs.append((0, dfa.index(state)))
Expand All @@ -52,7 +52,7 @@ def make_grammar(self):
def make_first(self, c, name):
rawfirst = self.first[name]
first = {}
for label in rawfirst:
for label in sorted(rawfirst):
ilabel = self.make_label(c, label)
##assert ilabel not in first # XXX failed on <> ... !=
first[ilabel] = 1
Expand Down Expand Up @@ -192,7 +192,7 @@ def addclosure(state, base):
for label, next in nfastate.arcs:
if label is not None:
addclosure(next, arcs.setdefault(label, {}))
for label, nfaset in arcs.items():
for label, nfaset in sorted(arcs.items()):
for st in states:
if st.nfaset == nfaset:
break
Expand Down Expand Up @@ -222,7 +222,7 @@ def dump_dfa(self, name, dfa):
print("Dump of DFA for", name)
for i, state in enumerate(dfa):
print(" State", i, state.isfinal and "(final)" or "")
for label, next in state.arcs.items():
for label, next in sorted(state.arcs.items()):
print(" %s -> %d" % (label, dfa.index(next)))

def simplify_dfa(self, dfa):
Expand Down
6 changes: 3 additions & 3 deletions Lib/lib2to3/tests/support.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@

# Local imports
from lib2to3 import pytree, refactor
from lib2to3.pgen2 import driver
from lib2to3.pgen2 import driver as pgen2_driver

# Directory containing this test package; used to locate Grammar.txt.
test_dir = os.path.dirname(__file__)
# Root of the lib2to3 package (one level up from the tests).
proj_dir = os.path.normpath(os.path.join(test_dir, ".."))
grammar_path = os.path.join(test_dir, "..", "Grammar.txt")
# Build the shared grammar and Driver once at import time; test helpers
# below reuse them.  The module is imported as ``pgen2_driver`` so that the
# module-level name ``driver`` (a Driver *instance*) does not shadow the
# lib2to3.pgen2.driver module itself — the diff's stale pre-change lines
# (``grammar = driver.load_grammar(...)``) would raise NameError and are
# dropped here.
grammar = pgen2_driver.load_grammar(grammar_path)
driver = pgen2_driver.Driver(grammar, convert=pytree.convert)

def parse_string(string):
    """Parse *string* with the shared lib2to3 driver, after normalizing it
    through reformat(); returns the driver's parse result (debug enabled)."""
    return driver.parse_string(reformat(string), debug=True)
Expand Down
72 changes: 69 additions & 3 deletions Lib/lib2to3/tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,22 @@
test_grammar.py files from both Python 2 and Python 3.
"""

from __future__ import with_statement

# Testing imports
from . import support
from .support import driver, test_dir
from test.support import verbose

# Python imports
import os
import shutil
import subprocess
import sys
import tempfile
import unittest
import warnings
import subprocess

# Local imports
from lib2to3.pgen2 import driver as pgen2_driver
from lib2to3.pgen2 import tokenize
from ..pgen2.parse import ParseError
from lib2to3.pygram import python_symbols as syms
Expand All @@ -35,6 +36,71 @@ def test_formfeed(self):
self.assertEqual(t.children[1].children[0].type, syms.print_stmt)


class TestPgen2Caching(support.TestCase):
    """Tests for the grammar pickle cache created by pgen2 driver.load_grammar()."""

    def test_load_grammar_from_txt_file(self):
        # Parsing the .txt grammar directly must work without touching a cache.
        pgen2_driver.load_grammar(support.grammar_path, save=False, force=True)

    def test_load_grammar_from_pickle(self):
        # Make a copy of the grammar file in a temp directory we are
        # guaranteed to be able to write to.
        tmpdir = tempfile.mkdtemp()
        try:
            grammar_copy = os.path.join(
                tmpdir, os.path.basename(support.grammar_path))
            shutil.copy(support.grammar_path, grammar_copy)
            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)

            # First load generates and saves the pickle cache.
            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
            self.assertTrue(os.path.exists(pickle_name))

            os.unlink(grammar_copy)  # Only the pickle remains...
            # Second load must succeed purely from the pickle cache.
            pgen2_driver.load_grammar(grammar_copy, save=False, force=False)
        finally:
            shutil.rmtree(tmpdir)

    @unittest.skipIf(sys.executable is None, 'sys.executable required')
    def test_load_grammar_from_subprocess(self):
        # Verify the cache is byte-for-byte stable across interpreter runs
        # with different hash randomization seeds.
        tmpdir = tempfile.mkdtemp()
        tmpsubdir = os.path.join(tmpdir, 'subdir')
        try:
            os.mkdir(tmpsubdir)
            grammar_base = os.path.basename(support.grammar_path)
            grammar_copy = os.path.join(tmpdir, grammar_base)
            grammar_sub_copy = os.path.join(tmpsubdir, grammar_base)
            shutil.copy(support.grammar_path, grammar_copy)
            shutil.copy(support.grammar_path, grammar_sub_copy)
            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
            pickle_sub_name = pgen2_driver._generate_pickle_name(
                grammar_sub_copy)
            # Distinct source paths must map to distinct cache paths.
            self.assertNotEqual(pickle_name, pickle_sub_name)

            # Generate a pickle file from this process.
            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
            self.assertTrue(os.path.exists(pickle_name))

            # Generate a new pickle file in a subprocess with a most likely
            # different hash randomization seed.
            sub_env = dict(os.environ)
            sub_env['PYTHONHASHSEED'] = 'random'
            subprocess.check_call(
                [sys.executable, '-c', """
from lib2to3.pgen2 import driver as pgen2_driver
pgen2_driver.load_grammar(%r, save=True, force=True)
""" % (grammar_sub_copy,)],
                env=sub_env)
            self.assertTrue(os.path.exists(pickle_sub_name))

            # Both caches were built from identical grammar input, so their
            # contents must match exactly despite differing hash seeds.
            with open(pickle_name, 'rb') as pickle_f_1, \
                    open(pickle_sub_name, 'rb') as pickle_f_2:
                self.assertEqual(
                    pickle_f_1.read(), pickle_f_2.read(),
                    msg='Grammar caches generated using different hash seeds'
                        ' were not identical.')
        finally:
            shutil.rmtree(tmpdir)



class GrammarTest(support.TestCase):
def validate(self, code):
support.parse_string(code)
Expand Down
4 changes: 4 additions & 0 deletions Misc/NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ Core and Builtins
Library
-------

- lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
between runs given the same Grammar.txt input regardless of the hash
randomization setting.

- Issue #27570: Avoid zero-length memcpy() etc calls with null source
pointers in the "ctypes" and "array" modules.

Expand Down

0 comments on commit dd1c638

Please sign in to comment.