Add native support for incremental caching, and use it in jvm_compile
This moves `jvm_compile` to the `cache_target_dirs` setting in order to help kill the manual caching API before the engine refactor. In order to do that, it was necessary to add native support for incremental compilation to `cache_manager.py` and `task.py`.

Optional incremental builds are implemented by copying from the previous `results_dir` for a VersionedTarget to the current one. This has the advantage that `results_dir`s are immutable (concurrent `./pants run.jvm` invocations should now be safe, regardless of changing ivy resolves).

- Implement optional incrementalism for tasks, with copy-on-write `results_dir`s
- Switch jvm_compile to cache_target_dirs, and drop all usage of `self.workdir`
- Replace the cache_hit_callback with an `rmtree` of a target's `results_dir` before extracting a cache hit
- Privatize stale methods and delete unused methods in `task.py`
- Move the `no_cache` tag from jvm_target to target in order to implement it in `task.py`
- Add `vt.id` to the path for `vt.results_dir`, which fixed an issue where two targets with identical fingerprints would share a `results_dir` (even though that will "usually" work, it's confusing)
- Add `mark_dirty` methods to Payload and PayloadField, for testing purposes.
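
A minimal sketch of the copy-on-write scheme described above: only `create_results_dir`, `vt.id`, `vt.cache_key.hash`, and `is_incremental` appear in this commit; `find_previous_results_dir` and the exact control flow are assumptions, not the real `cache_manager.py` code.

```python
import os
import shutil

def create_results_dir(vt, root_dir, allow_incremental):
  # Each (target id, fingerprint) pair gets its own directory, so two targets
  # with identical fingerprints no longer share (and race on) one results_dir.
  new_dir = os.path.join(root_dir, vt.id, vt.cache_key.hash)
  if os.path.isdir(new_dir):
    return new_dir
  previous_dir = find_previous_results_dir(vt)  # hypothetical lookup of the last build
  if allow_incremental and previous_dir is not None:
    # Clone the previous outputs: the current build starts from them, while the
    # old results_dir stays immutable for any concurrent reader.
    shutil.copytree(previous_dir, new_dir)
    vt.is_incremental = True
  else:
    os.makedirs(new_dir)
  return new_dir
```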

Testing Done:
https://travis-ci.org/pantsbuild/pants/builds/86336104

Bugs closed: 2394

Reviewed at https://rbcommons.com/s/twitter/r/2991/
stuhood committed Oct 22, 2015
1 parent 64d6484 commit ab571f1
Showing 26 changed files with 468 additions and 434 deletions.
9 changes: 9 additions & 0 deletions pants.cache.ini
@@ -0,0 +1,9 @@
# A config to globally enable caching for the purposes of manual testing.

[DEFAULT]
local_artifact_cache = %(buildroot)s/.buildcache


[cache]
read_from = ["%(local_artifact_cache)s"]
write_to = ["%(local_artifact_cache)s"]
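
The `%(...)s` values are ConfigParser-style interpolations, with `buildroot` seeded into `[DEFAULT]` by Pants itself. A standalone sketch of how they resolve, in modern Python 3 for illustration (Pants at the time ran on Python 2):

```python
from configparser import ConfigParser

config_text = """\
[DEFAULT]
buildroot = /repo
local_artifact_cache = %(buildroot)s/.buildcache

[cache]
read_from = ["%(local_artifact_cache)s"]
write_to = ["%(local_artifact_cache)s"]
"""

parser = ConfigParser()
parser.read_string(config_text)
# Interpolation recurses through [DEFAULT]:
print(parser.get('cache', 'read_from'))  # ["/repo/.buildcache"]
```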
7 changes: 0 additions & 7 deletions src/python/pants/backend/codegen/tasks/apache_thrift_gen.py
@@ -66,13 +66,6 @@ def _thrift_binary(self):
thrift_binary = ThriftBinary.Factory.scoped_instance(self).create()
return thrift_binary.path

def invalidate_for_files(self):
# TODO: This will prevent artifact caching across platforms.
# Find some cross-platform way to assert the thrift binary version.
# NB: We have access to the version via the ThriftBinary instance's `version`, we just need
# support for invalidation based on non-files.
return [self._thrift_binary]

@memoized_property
def _deps(self):
deps = self.get_options().deps
69 changes: 47 additions & 22 deletions src/python/pants/backend/core/tasks/task.py
@@ -236,15 +236,6 @@ def artifact_cache_reads_enabled(self):
def artifact_cache_writes_enabled(self):
return self._cache_factory.write_cache_available()

def invalidate_for_files(self):
"""Provides extra files that participate in invalidation.
Subclasses can override and return a list of full paths to extra, non-source files that should
be checked for changes when managing target invalidation. This is useful for tracking
changes to pre-built build tools, e.g., the thrift compiler.
"""
return []

def invalidate(self):
"""Invalidates all targets for this task."""
BuildInvalidator(self._build_invalidator_dir).force_invalidate_all()
@@ -279,6 +270,26 @@ def cache_target_dirs(self):
"""
return False

@property
def incremental(self):
"""Whether this Task implements incremental building of individual targets.
Incremental tasks with `cache_target_dirs` set will have the results_dir of the previous build
for a target cloned into the results_dir for the current build (where possible). This
copy-on-write behaviour allows for immutability of the results_dir once a target has been
marked valid.
"""
return False

@property
def cache_incremental(self):
"""For incremental tasks, indicates whether the results of incremental builds should be cached.
Deterministic per-target incremental compilation is a relatively difficult thing to implement,
so this property provides an escape hatch to avoid caching things in that riskier case.
"""
return False

@contextmanager
def invalidated(self,
targets,
@@ -355,9 +366,7 @@ def invalidated(self,
invalidation_check = \
InvalidationCheck(invalidation_check.all_vts, uncached_vts, partition_size_hint, colors)

if self.cache_target_dirs:
for vt in invalidation_check.all_vts:
vt.create_results_dir(os.path.join(self.workdir, vt.cache_key.hash))
self._maybe_create_results_dirs(invalidation_check.all_vts)

if not silent:
targets = []
@@ -393,11 +402,27 @@ def invalidated(self,
and self.artifact_cache_writes_enabled()
and invalidation_check.invalid_vts)
if write_to_cache:
def result_files(vt):
return [os.path.join(vt.results_dir, f) for f in os.listdir(vt.results_dir)]
pairs = [(vt, result_files(vt)) for vt in invalidation_check.invalid_vts]
pairs = []
for vt in invalidation_check.invalid_vts:
if self._should_cache(vt):
pairs.append((vt, [vt.results_dir]))
self.update_artifact_cache(pairs)

def _should_cache(self, vt):
"""Return true if the given vt should be written to a cache (if configured)."""
if vt.target.has_label('no_cache'):
return False
elif not vt.is_incremental or self.cache_incremental:
return True
else:
return False

def _maybe_create_results_dirs(self, vts):
"""If `cache_target_dirs`, create results_dirs for the given versioned targets."""
if self.cache_target_dirs:
for vt in vts:
vt.create_results_dir(self.workdir, allow_incremental=self.incremental)

def check_artifact_cache_for(self, invalidation_check):
"""Decides which VTS to check the artifact cache for.
@@ -414,14 +439,11 @@ def check_artifact_cache(self, vts):
"""
return self.do_check_artifact_cache(vts)

def do_check_artifact_cache(self, vts, post_process_cached_vts=None, cache_hit_callback=None):
def do_check_artifact_cache(self, vts, post_process_cached_vts=None):
"""Checks the artifact cache for the specified list of VersionedTargetSets.
Returns a pair (cached, uncached) of VersionedTargets that were
satisfied/unsatisfied from the cache.
:param cache_hit_callback: A serializable function that expects a CacheKey as an argument.
Called after a cache hit, but before the cached artifact is extracted.
"""
if not vts:
return [], []
@@ -430,7 +452,8 @@ def do_check_artifact_cache(self, vts, post_process_cached_vts=None, cache_hit_c
uncached_vts = OrderedSet(vts)

read_cache = self._cache_factory.get_read_cache()
items = [(read_cache, vt.cache_key, cache_hit_callback) for vt in vts]
items = [(read_cache, vt.cache_key, vt.results_dir if vt.has_results_dir else None)
for vt in vts]

res = self.context.subproc_map(call_use_cached_files, items)

@@ -441,6 +464,8 @@ def do_check_artifact_cache(self, vts, post_process_cached_vts=None, cache_hit_c
elif isinstance(was_in_cache, UnreadableArtifact):
self._cache_key_errors.update(was_in_cache.key)

self._maybe_create_results_dirs(vts)

# Note that while the input vts may represent multiple targets (for tasks that override
# check_artifact_cache_for), the ones we return must represent single targets.
def flatten(vts):
@@ -459,12 +484,12 @@ def update_artifact_cache(self, vts_artifactfiles_pairs):
- vts is single VersionedTargetSet.
- artifactfiles is a list of absolute paths to artifacts for the VersionedTargetSet.
"""
update_artifact_cache_work = self.get_update_artifact_cache_work(vts_artifactfiles_pairs)
update_artifact_cache_work = self._get_update_artifact_cache_work(vts_artifactfiles_pairs)
if update_artifact_cache_work:
self.context.submit_background_work_chain([update_artifact_cache_work],
parent_workunit_name='cache')

def get_update_artifact_cache_work(self, vts_artifactfiles_pairs):
def _get_update_artifact_cache_work(self, vts_artifactfiles_pairs):
"""Create a Work instance to update an artifact cache, if we're configured to.
vts_artifactfiles_pairs - a list of pairs (vts, artifactfiles) where
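
Taken together, `cache_target_dirs`, `incremental`, and `cache_incremental` form the whole opt-in surface added to `task.py` above. A hypothetical task (not part of this commit) that wants framework-managed, incremental, but uncached results_dirs would override them like so:

```python
from pants.backend.core.tasks.task import Task

class MyIncrementalTask(Task):
  """Illustrative only: a task with framework-managed, incremental results_dirs."""

  @property
  def cache_target_dirs(self):
    # invalidated() will create a results_dir per VersionedTarget and cache it.
    return True

  @property
  def incremental(self):
    # New results_dirs are seeded with a copy of the previous build's outputs.
    return True

  @property
  def cache_incremental(self):
    # Incremental outputs may not be deterministic, so _should_cache() will
    # skip the artifact cache for any vt that was built incrementally.
    return False
```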
4 changes: 0 additions & 4 deletions src/python/pants/backend/jvm/targets/jvm_target.py
@@ -32,7 +32,6 @@ def __init__(self,
provides=None,
excludes=None,
resources=None,
no_cache=False,
services=None,
platform=None,
**kwargs):
@@ -42,7 +41,6 @@ def __init__(self,
:param sources: Source code files to build. Paths are relative to the BUILD
file's directory.
:type sources: ``Fileset`` (from globs or rglobs) or list of strings
:param no_cache: If True, this should not be stored in the artifact cache
:param services: A dict mapping service interface names to the classes owned by this target
that implement them. Keys are fully qualified service class names, values are
lists of strings, each string the fully qualified class name of a class owned
@@ -74,8 +72,6 @@ def __init__(self,
self._services = services or {}

self.add_labels('jvm')
if no_cache:
self.add_labels('no_cache')

@property
def platform(self):
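
With the `no_cache` tag now applied in the base `Target`, any target type can opt out of caching, not just JVM targets. A hypothetical BUILD file entry (names invented, assuming the keyword still propagates through `**kwargs`):

```python
java_library(
  name='scratch-lib',
  sources=globs('*.java'),
  no_cache=True,  # sets the 'no_cache' label; _should_cache() in task.py then skips it
)
```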
@@ -122,21 +122,20 @@ def parse_num_items(self, line):
raise ParseError('Expected: "<num> items". Found: "{0}"'.format(line))
return int(matchobj.group(1))

def rebase_from_path(self, infile_path, outfile_path, pants_home_from, pants_home_to,
java_home=None):
def rebase_from_path(self, infile_path, outfile_path, old_base, new_base, java_home=None):
"""Rebase an analysis at infile_path, writing the result to outfile_path.
See rebase() below for an explanation of rebasing.
"""
with open(infile_path, 'rb') as infile:
with open(outfile_path, 'wb') as outfile:
self.rebase(infile, outfile, pants_home_from, pants_home_to, java_home)
self.rebase(infile, outfile, old_base, new_base, java_home)

def rebase(self, infile, outfile, pants_home_from, pants_home_to, java_home=None):
def rebase(self, infile, outfile, old_base, new_base, java_home=None):
"""Rebase an analysis read from infile and write the result to outfile.
Rebasing means replacing references to paths under pants_home_from with references to
equivalent paths under pants_home_to.
Rebasing means replacing references to paths under old_base with references to
equivalent paths under new_base.
If java_home is specified then any references to paths under it are scrubbed entirely.
"""
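
A usage sketch of the renamed API (paths are invented; per the docstring, passing a real `java_home` would scrub references under it instead of rebasing them):

```python
# Rewrite absolute paths in a compile analysis so it is valid under a new buildroot.
parser.rebase_from_path(
  infile_path='/old/buildroot/.pants.d/compile/z.analysis',
  outfile_path='/new/buildroot/.pants.d/compile/z.analysis',
  old_base='/old/buildroot',
  new_base='/new/buildroot',
  java_home=None,
)
```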
@@ -51,6 +51,9 @@ def merge_from_paths(self, analysis_paths, merged_analysis_path):
merged_analysis = self._analysis_cls.merge(analyses)
merged_analysis.write_to_path(merged_analysis_path)

def rebase_from_path(self, infile_path, outfile_path, old_base, new_base):
self.parser.rebase_from_path(infile_path, outfile_path, old_base, new_base, java_home=None)

def relativize(self, src_analysis, relativized_analysis):
with temporary_dir() as tmp_analysis_dir:
tmp_analysis_file = os.path.join(tmp_analysis_dir, 'analysis.relativized')
@@ -18,11 +18,14 @@ class CompileContext(object):
and a finalized compile in its permanent location.
"""

def __init__(self, target, analysis_file, classes_dir, jar_file, sources):
def __init__(self, target, analysis_file, portable_analysis_file, classes_dir, jar_file,
log_file, sources):
self.target = target
self.analysis_file = analysis_file
self.portable_analysis_file = portable_analysis_file
self.classes_dir = classes_dir
self.jar_file = jar_file
self.log_file = log_file
self.sources = sources

@contextmanager
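
The widened constructor keeps every per-target output location in one place. A hypothetical instantiation (file names invented; `results_dir` and `target` assumed in scope, and `sources_relative_to_buildroot()` assumed available on the target):

```python
import os

ctx = CompileContext(
  target=target,
  analysis_file=os.path.join(results_dir, 'z.analysis'),
  portable_analysis_file=os.path.join(results_dir, 'z.analysis.portable'),
  classes_dir=os.path.join(results_dir, 'classes'),
  jar_file=os.path.join(results_dir, 'z.jar'),
  log_file=os.path.join(results_dir, 'debug.log'),
  sources=target.sources_relative_to_buildroot(),
)
```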