Skip to content

Commit

Permalink
Support zinc name hashing.
Browse files Browse the repository at this point in the history
- Upgrade to a recent version of zinc.
- Support version 5 of the analysis serialization format.
- This version moves the CompileSetup sections from the end of
the file to the beginning.
- This version adds a "name hashing" section to the CompileSetup.
- Add a pants option to turn name-hashing on.
- Fix the implementation of is_nonempty_analysis(), which can no
longer simply look at a fixed prefix (because the order of elements in the zinc
analysis file has changed).

We had already added support for splitting/merging the analysis sections
used by name hashing, under the assumption that their structure and semantics
were the same as for their equivalent pre-name-hashing sections.  We had been
told by TypeSafe that this assumption was correct, but had never tested it.
I have now verified that it is indeed true.

Note that when name hashing is turned on, the member* and inheritance* sections
are populated by zinc INSTEAD OF the direct* and public* sections. However the
"used names" section is populated AS WELL AS the "class names" section. This means
that turning on name hashing will cause analysis files to be larger. Whether this
is significant, in particular with respect to split/merge times, needs to be measured. I suspect
it should be OK, since split/merge of these sections is simple - they don't have
the complicated internalization/externalization logic.

Testing Done:
CI passes: https://travis-ci.org/pantsbuild/pants/builds/50907336

Reviewed at https://rbcommons.com/s/twitter/r/1779/
  • Loading branch information
Benjy committed Feb 25, 2015
1 parent 1da125c commit 2d9972a
Show file tree
Hide file tree
Showing 11 changed files with 109 additions and 141 deletions.
2 changes: 1 addition & 1 deletion BUILD.tools
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jar_library(name = 'benchmark-java-allocation-instrumenter-2.1',

jar_library(name = 'zinc',
jars = [
jar(org = 'com.typesafe.zinc', name = 'zinc', rev = '0.3.2-M1')
jar(org = 'com.typesafe.zinc', name = 'zinc', rev = '0.3.7')
.exclude(org = 'com.martiansoftware', name = 'nailgun-server')
.exclude(org = 'org.ensime', name = 'ensime-sbt-cmd')
])
Expand Down
37 changes: 26 additions & 11 deletions src/python/pants/backend/jvm/tasks/jvm_compile/analysis_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
unicode_literals, with_statement)

import os
import re

from pants.base.exceptions import TaskError

Expand All @@ -16,24 +17,29 @@ class ParseError(TaskError):

class AnalysisParser(object):
"""Parse a file containing representation of an analysis for some JVM language."""

def __init__(self, classes_dir):
  """
  :param classes_dir: The output dir for classes covered by this analysis.
  """
  self.classes_dir = classes_dir  # The output dir for classes in this analysis.

@property
def empty_test_header(self):
  """The header of a section that will be nonempty iff the analysis is nonempty.

  We look at this section to determine whether the analysis contains any useful data.
  Subclasses return the section name without the trailing colon (is_nonempty_analysis
  appends ':\n' when matching lines).
  """
  raise NotImplementedError('Subclasses must implement.')

def is_nonempty_analysis(self, path):
  """Does the specified analysis file contain information for at least one source file.

  :param path: Path to an analysis file; need not exist.
  :returns: True iff the file exists and the empty-test section has at least one item.

  The analysis format no longer allows a simple fixed-prefix check (the order of
  sections changed), so we scan for the section named by self.empty_test_header
  and inspect its item count instead.
  """
  if not os.path.exists(path):
    return False
  with open(path, 'r') as infile:
    # Skip until we get to the section that will be nonempty iff the analysis is nonempty.
    expected_header = '{0}:\n'.format(self.empty_test_header)
    # NOTE(review): file iteration via .next() is Python 2 style; this loop raises
    # StopIteration if the header is missing entirely — assumes well-formed analysis
    # files. TODO: confirm that malformed files cannot reach this code path.
    while infile.next() != expected_header:
      pass
    # Now see if this section is empty or not.
    return self.parse_num_items(infile.next()) > 0

def parse_from_path(self, infile_path):
"""Parse an analysis instance from a text file."""
Expand Down Expand Up @@ -82,3 +88,12 @@ def parse_deps(self, infile, classpath_indexer):
All paths are absolute.
"""
raise NotImplementedError()

# Matches section-size lines such as '3 items\n'; compiled once at class scope.
_num_items_re = re.compile(r'(\d+) items\n')

def parse_num_items(self, line):
  """Parse a line of the form '<num> items' and returns <num> as an int.

  Raises ParseError if the line does not have the expected form.
  """
  match = self._num_items_re.match(line)
  if match:
    return int(match.group(1))
  raise ParseError('Expected: "<num> items". Found: "{0}"'.format(line))
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
unicode_literals, with_statement)

import os
import re
from collections import defaultdict

from pants.backend.jvm.tasks.jvm_compile.analysis_parser import AnalysisParser, ParseError
Expand All @@ -17,12 +16,11 @@
class JMakeAnalysisParser(AnalysisParser):
"""Parse a file containing representation of an analysis for some JVM language."""

# Pre-change implementation (removed by this commit): a trivial jmake analysis
# always began with exactly this prefix, so emptiness was a fixed-prefix check.
def empty_prefix(self):
  return 'pcd entries:\n0 items\n'
# Post-change replacement: AnalysisParser.is_nonempty_analysis now scans for this
# section header and checks its item count instead of comparing a fixed prefix.
empty_test_header = 'pcd entries'

def parse(self, infile):
self._expect_header(infile.readline(), 'pcd entries')
num_pcd_entries = self._parse_num_items(infile.readline())
num_pcd_entries = self.parse_num_items(infile.readline())
pcd_entries = []
for i in xrange(0, num_pcd_entries):
line = infile.readline()
Expand All @@ -35,7 +33,7 @@ def parse(self, infile):

def parse_products(self, infile):
self._expect_header(infile.readline(), 'pcd entries')
num_pcd_entries = self._parse_num_items(infile.readline())
num_pcd_entries = self.parse_num_items(infile.readline())
ret = defaultdict(list)
# Parse more efficiently than above, since we only care about
# the first two elements in the line.
Expand All @@ -52,7 +50,7 @@ def parse_deps(self, infile, classpath_indexer):
buildroot = get_buildroot()
classpath_elements_by_class = classpath_indexer()
self._expect_header(infile.readline(), 'pcd entries')
num_pcd_entries = self._parse_num_items(infile.readline())
num_pcd_entries = self.parse_num_items(infile.readline())
for _ in xrange(0, num_pcd_entries):
infile.readline() # Skip these lines.
src_to_deps = self._parse_deps_at_position(infile)
Expand All @@ -73,7 +71,7 @@ def parse_deps(self, infile, classpath_indexer):

def _parse_deps_at_position(self, infile):
self._expect_header(infile.readline(), 'dependencies')
num_deps = self._parse_num_items(infile.readline())
num_deps = self.parse_num_items(infile.readline())
src_to_deps = {}
for i in xrange(0, num_deps):
tpl = infile.readline().split('\t')
Expand All @@ -83,15 +81,6 @@ def _parse_deps_at_position(self, infile):
src_to_deps[src] = deps
return src_to_deps

# Matches section-size lines such as '3 items\n'; compiled once at class scope.
num_items_re = re.compile(r'(\d+) items\n')

def _parse_num_items(self, line):
  """Parse a line of the form '<num> items' and returns <num> as an int.

  Raises ParseError for any line not matching the expected form.
  """
  match = JMakeAnalysisParser.num_items_re.match(line)
  if match is None:
    raise ParseError('Expected: "<num> items". Found: "%s"' % line)
  return int(match.group(1))

def _expect_header(self, line, header):
expected = header + ':\n'
if line != expected:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def register_options(cls, register):
# Note: Used in ZincUtils.
# TODO: Revisit this. It's unintuitive for ZincUtils to reach back into the task for options.
register('--plugins', action='append', help='Use these scalac plugins.')
register('--name-hashing', action='store_true', default=False, help='Use zinc name hashing.')
ZincUtils.register_options(register, cls.register_jvm_tool)

def __init__(self, *args, **kwargs):
Expand Down
131 changes: 58 additions & 73 deletions src/python/pants/backend/jvm/tasks/jvm_compile/scala/zinc_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
unicode_literals, with_statement)

import itertools
import json
import os
from collections import defaultdict

Expand Down Expand Up @@ -40,10 +39,6 @@ class ZincAnalysisElement(object):
# The section names for the sections in this element. Subclasses override.
headers = ()

@classmethod
def from_json_obj(cls, obj):
  """Create an instance from a JSON dict keyed by this class's section headers."""
  args = [obj[header] for header in cls.headers]
  return cls(args)

def __init__(self, args):
# self.args is a list of maps from key to list of values. Each map corresponds to a
# section in the analysis file. E.g.,
Expand Down Expand Up @@ -136,7 +131,7 @@ class ZincAnalysis(Analysis):

# Implementation of class method required by Analysis.

# Pre-change value (removed by this commit): the format written by the older zinc.
FORMAT_VERSION_LINE = 'format version: 4\n'
# Post-change value: version 5 moves CompileSetup to the front of the file and
# adds a 'name hashing' section (see the commit description).
FORMAT_VERSION_LINE = 'format version: 5\n'

@staticmethod
def merge_dicts(dicts):
Expand All @@ -153,6 +148,10 @@ def merge_dicts(dicts):
def merge(cls, analyses):
# Note: correctly handles "internalizing" external deps that must be internal post-merge.

# "Merge" compile setup. We assume that all merged analyses have the same setup, so we just take the
# setup of the first analysis. TODO: Validate that all analyses have the same setup.
compile_setup = analyses[0].compile_setup if len(analyses) > 0 else CompileSetup((defaultdict(list), ))

# Merge relations.
src_prod = ZincAnalysis.merge_dicts([a.relations.src_prod for a in analyses])
binary_dep = ZincAnalysis.merge_dicts([a.relations.binary_dep for a in analyses])
Expand Down Expand Up @@ -235,38 +234,37 @@ def merge_dependencies(internals, externals):
compilations_dict['%03d' % i] = [v]
compilations = Compilations((compilations_dict, ))

compile_setup = analyses[0].compile_setup if len(analyses) > 0 else CompileSetup((defaultdict(list), ))
return ZincAnalysis(relations, stamps, apis, source_infos, compilations, compile_setup)
return ZincAnalysis(compile_setup, relations, stamps, apis, source_infos, compilations)

# Pre-change constructor (removed by this commit): compile_setup came last.
def __init__(self, relations, stamps, apis, source_infos, compilations, compile_setup):
  (self.relations, self.stamps, self.apis, self.source_infos, self.compilations, self.compile_setup) = \
    (relations, stamps, apis, source_infos, compilations, compile_setup)

# Post-change constructor: compile_setup is now first, mirroring its new position
# at the beginning of a version 5 analysis file.
def __init__(self, compile_setup, relations, stamps, apis, source_infos, compilations):
  (self.compile_setup, self.relations, self.stamps, self.apis, self.source_infos, self.compilations) = \
    (compile_setup, relations, stamps, apis, source_infos, compilations)

def diff(self, other):
  """Returns a list of element diffs, one per element where self and other differ.

  :param other: Another analysis to compare against, element by element.
  :returns: A list of element-diff objects (possibly empty).
  """
  element_diffs = []
  # Compare elements pairwise, in the same order they appear in a version 5
  # analysis file (compile_setup first).
  element_pairs = zip(
      (self.compile_setup, self.relations, self.stamps, self.apis,
       self.source_infos, self.compilations),
      (other.compile_setup, other.relations, other.stamps, other.apis,
       other.source_infos, other.compilations))
  for self_elem, other_elem in element_pairs:
    element_diff = self_elem.diff(other_elem)
    if element_diff.is_different():
      element_diffs.append(element_diff)
  return element_diffs

def __eq__(self, other):
  """Structural equality: all six elements must compare equal, pairwise."""
  return ((self.compile_setup, self.relations, self.stamps, self.apis,
           self.source_infos, self.compilations) ==
          (other.compile_setup, other.relations, other.stamps, other.apis,
           other.source_infos, other.compilations))

def __ne__(self, other):
  # Python 2 does not derive __ne__ from __eq__, so define it explicitly.
  return not self.__eq__(other)

def __hash__(self):
  # Keep in sync with __eq__: hash over the same tuple of elements.
  return hash((self.compile_setup, self.relations, self.stamps, self.apis,
               self.source_infos, self.compilations))

# Implementation of methods required by Analysis.

Expand All @@ -284,9 +282,7 @@ def split(self, splits, catchall=False):
binary_dep_splits = self._split_dict(self.relations.binary_dep, splits)
classes_splits = self._split_dict(self.relations.classes, splits)

# For historical reasons, external deps are specified as src->class while internal deps are
# specified as src->src. So we pick a representative class for each src.
representatives = dict((k, min(vs)) for k, vs in self.relations.classes.items())
representatives = dict((k, self.representative(k, vs)) for k, vs in self.relations.classes.items())

def split_dependencies(all_internal, all_external):
internals = []
Expand Down Expand Up @@ -373,40 +369,30 @@ def split_dependencies(all_internal, all_external):

analyses = []
for relations, stamps, apis, source_infos in zip(relations_splits, stamps_splits, apis_splits, source_info_splits):
analyses.append(ZincAnalysis(relations, stamps, apis, source_infos, self.compilations, self.compile_setup))
analyses.append(ZincAnalysis(self.compile_setup, relations, stamps, apis, source_infos, self.compilations))

return analyses

def write(self, outfile, rebasings=None):
  """Write this analysis in version 5 text format.

  :param outfile: An open file-like object to write to.
  :param rebasings: Optional path-rebasing rules, passed through to each element.
  """
  outfile.write(ZincAnalysis.FORMAT_VERSION_LINE)
  # Version 5 moved the CompileSetup sections from the end of the file to the
  # beginning, so compile_setup is written first (and exactly once).
  self.compile_setup.write(outfile, inline_vals=True, rebasings=rebasings)
  self.relations.write(outfile, rebasings=rebasings)
  self.stamps.write(outfile, rebasings=rebasings)
  self.apis.write(outfile, inline_vals=False, rebasings=rebasings)
  self.source_infos.write(outfile, inline_vals=False, rebasings=rebasings)
  self.compilations.write(outfile, inline_vals=True, rebasings=rebasings)

# Extra methods on this class only.

# Anonymize the contents of this analysis. Useful for creating test data.
# Translate the contents of this analysis. Useful for creating anonymized test data.
# Note that the resulting file is not a valid analysis, as the base64-encoded serialized objects
# will be replaced with random base64 strings. So these are useful for testing analysis parsing,
# splitting and merging, but not for actually reading into Zinc.
def translate(self, token_translator):
  """Translate (e.g. anonymize) the contents of every element of this analysis.

  :param token_translator: Object each element uses to rewrite its tokens.
  Useful for creating anonymized test data; the result is not a valid analysis
  (see the notes preceding this method).
  """
  # Same element order as a version 5 analysis file: compile_setup first.
  for element in [self.compile_setup, self.relations, self.stamps, self.apis,
                  self.source_infos, self.compilations]:
    element.translate(token_translator)

# Write this analysis to JSON. (These helpers were removed by this commit, along
# with the json import and the ZincAnalysisJSONEncoder they depend on.)
def write_json_to_path(self, outfile_path):
  """Serialize this analysis as JSON to the file at outfile_path."""
  with open(outfile_path, 'w') as outfile:
    self.write_json(outfile)

def write_json(self, outfile):
  """Dump all six elements as one JSON object keyed by element name."""
  obj = dict(zip(('relations', 'stamps', 'apis', 'source_infos', 'compilations', 'compile_setup'),
                 (self.relations, self.stamps, self.apis, self.source_infos, self.compilations, self.compile_setup)))
  json.dump(obj, outfile, cls=ZincAnalysisJSONEncoder, sort_keys=True, indent=2)

def _split_dict(self, d, splits):
"""Split a dict by its keys.
Expand All @@ -422,6 +408,39 @@ def _split_dict(self, d, splits):
ret.append(dict_split)
return ret

def representative(self, src, classes):
  """Pick a representative class for a source file.

  For historical reasons, external deps are specified as src->class while internal deps are
  specified as src->src. So when splitting we need to pick a representative class for each
  src. We must pick consistently.

  :param src: Path of a source file.
  :param classes: Non-empty collection of fully-qualified class names produced by src.
  :returns: One fully-qualified class name drawn from classes.
  """
  primary_class_name = os.path.splitext(os.path.basename(src))[0]
  for fqcn in classes:
    if fqcn.rsplit('.', 1)[-1] == primary_class_name:
      # For ease of debugging, pick the class with the same name as the source file, if it exists.
      # Bug fix: return the fully-qualified name (fqcn), not the bare class name, so the
      # result is always an actual member of `classes`, consistent with the min() fallback.
      return fqcn
  # Pick the class that sorts lowest in dictionary order.
  return min(classes)


class CompileSetup(ZincAnalysisElement):
  """The compile-setup element of an analysis.

  In format version 5 this element appears at the beginning of the file and
  includes the new 'name hashing' section.
  """

  headers = ('output mode', 'output directories','compile options','javac options',
             'compiler version', 'compile order', 'name hashing')

  def __init__(self, args):
    super(CompileSetup, self).__init__(args)
    (self.output_mode, self.output_dirs, self.compile_options, self.javac_options,
     self.compiler_version, self.compile_order, self.name_hashing) = self.args

  def translate(self, token_translator):
    """Anonymize output dirs and drop compile options that mention custom plugins."""
    self.translate_values(token_translator, self.output_dirs)
    for k, vs in list(self.compile_options.items()):  # Make a copy, so we can del as we go.
      # Remove mentions of custom plugins. Bug fix: delete the key at most once — the
      # original deleted inside the inner value loop, raising KeyError whenever two
      # values matched (e.g. both an -Xplugin and a -P option under the same key).
      if any(v.startswith('-Xplugin') or v.startswith('-P') for v in vs):
        del self.compile_options[k]


class Relations(ZincAnalysisElement):
headers = ('products', 'binary dependencies',
Expand Down Expand Up @@ -513,37 +532,3 @@ def __init__(self, args):

def translate(self, token_translator):
  # No-op: this element has no values that need translating/anonymizing.
  # NOTE(review): the enclosing class header is outside this diff hunk — confirm
  # which element this belongs to before relying on this comment.
  pass


class CompileSetup(ZincAnalysisElement):
  """Pre-change CompileSetup (removed by this commit): no 'name hashing' section."""

  headers = ('output mode', 'output directories','compile options','javac options',
             'compiler version', 'compile order')

  def __init__(self, args):
    super(CompileSetup, self).__init__(args)
    (self.output_mode, self.output_dirs, self.compile_options, self.javac_options,
     self.compiler_version, self.compile_order) = self.args

  def translate(self, token_translator):
    """Anonymize output dirs and drop compile options that mention custom plugins."""
    self.translate_values(token_translator, self.output_dirs)
    for k, vs in list(self.compile_options.items()):  # Make a copy, so we can del as we go.
      # Remove mentions of custom plugins. Bug fix: delete the key at most once — the
      # original deleted inside the inner value loop, raising KeyError whenever two
      # values matched under the same key.
      if any(v.startswith('-Xplugin') or v.startswith('-P') for v in vs):
        del self.compile_options[k]


class ZincAnalysisJSONEncoder(json.JSONEncoder):
  """A custom encoder for writing analysis elements as JSON.

  Not currently used, but might be useful in the future, e.g., for creating javascript-y
  analysis browsing tools.
  """

  def default(self, obj):
    """Convert a ZincAnalysisElement to a dict keyed by its section headers."""
    if isinstance(obj, ZincAnalysisElement):
      # Map each section header to the corresponding entry of the element's args.
      return dict(zip(type(obj).headers, obj.args))
    # Bug fix: return the base-class result instead of discarding it. The base
    # implementation raises TypeError for unserializable objects, so behavior was
    # accidentally correct, but dropping the return value breaks cooperative
    # subclassing (a further subclass's default() result would be lost as None).
    return super(ZincAnalysisJSONEncoder, self).default(obj)
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def __unicode__(self):
for header, arg_diff in self._arg_diffs.items():
if arg_diff.is_different():
parts.append('Section "%s" differs:\n' % header)
parts.append(arg_diff)
parts.append(str(arg_diff))
parts.append('\n\n')
return ''.join(parts) # '' is a unicode, so the entire result will be.

Expand Down
Loading

0 comments on commit 2d9972a

Please sign in to comment.