Skip to content

Commit

Permalink
Add support for mutable, global, append only caches (pantsbuild#9852)
Browse files Browse the repository at this point in the history
### Problem

As described in [Mutable Caching for Processes](https://docs.google.com/document/d/1n_MVVGjrkTKTPKHqRPlyfFzQyx2QioclMG_Q3DMUgYk/edit#) (thank you for the feedback!), we believe that there are two primary usecases for mutable caching in processes (see the document for more detail):
1. global append only
2. local mutable

The first usecase refers to well-behaved systems with their own global append only caches: typically, resolvers like pip/PEX, Coursier, or a conglomeration of other such artifact caches. The second usecase is out of scope for this PR.

### Solution

This change addresses the first usecase by adding support to pants for `append_only_caches` in the `Process` API. It also begins to use such a cache for PEX.

In separate commits, also removes the `unsafe_local_only_files_..` setting to fix pantsbuild#8314 (which the design doc indicates that we will not want to maintain in a pure v2 world), and adds the builder pattern for `Process` for use in tests.

### Result

v2 usages of PEX will concurrently read and write a shared cache in a configurable location. Followup changes can add support for automatically pruning these caches, or propagating the environment variables in remoting via sidechannel information (see the document for more info).
  • Loading branch information
stuhood authored May 27, 2020
1 parent 25c2dad commit bbf36f6
Show file tree
Hide file tree
Showing 26 changed files with 419 additions and 682 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import os
import re
import textwrap
import zipfile
from collections import defaultdict
from contextlib import closing
from xml.etree import ElementTree
Expand All @@ -29,7 +28,7 @@
from pants.base.hash_utils import hash_file
from pants.base.workunit import WorkUnitLabel
from pants.build_graph.resources import Resources
from pants.engine.fs import EMPTY_DIGEST, DirectoryToMaterialize, PathGlobs, PathGlobsAndRoot
from pants.engine.fs import DirectoryToMaterialize, PathGlobs, PathGlobsAndRoot
from pants.engine.process import Process
from pants.util.contextutil import open_zip
from pants.util.dirutil import fast_relpath
Expand Down Expand Up @@ -726,9 +725,6 @@ def _compile_hermetic(
)

relpath_to_analysis = fast_relpath(ctx.analysis_file, get_buildroot())
merged_local_only_scratch_inputs = self._compute_local_only_inputs(
classes_dir, relpath_to_analysis, jar_file
)

# TODO: Extract something common from Executor._create_command to make the command line
argv = image_specific_argv + [f"@{argfile_snapshot.files[0]}"]
Expand All @@ -753,7 +749,6 @@ def _compile_hermetic(
output_files=(jar_file, relpath_to_analysis),
output_directories=output_directories,
description=f"zinc compile for {ctx.target.address.spec}",
unsafe_local_only_files_because_we_favor_speed_over_correctness_for_this_rule=merged_local_only_scratch_inputs,
jdk_home=self._zinc.underlying_dist.home,
is_nailgunnable=True,
)
Expand All @@ -767,42 +762,6 @@ def _compile_hermetic(
# TODO: This should probably return a ClasspathEntry rather than a Digest
return res.output_digest

def _compute_local_only_inputs(self, classes_dir, relpath_to_analysis, jar_file):
"""Compute for the scratch inputs for Process.
If analysis file exists, then incremental compile is enabled. Otherwise, the compile is not
incremental, an empty digest will be returned.
:param classes_dir: relative path to classes dir from buildroot
:param relpath_to_analysis: relative path to zinc analysis file from buildroot
:param jar_file: relative path to z.jar from buildroot
:return: digest of merged analysis file and loose class files.
"""
if not os.path.exists(relpath_to_analysis):
return EMPTY_DIGEST

def _get_analysis_snapshot():
(_analysis_snapshot,) = self.context._scheduler.capture_snapshots(
[PathGlobsAndRoot(PathGlobs([relpath_to_analysis]), get_buildroot())]
)
return _analysis_snapshot

def _get_classes_dir_snapshot():
if self.get_options().use_classpath_jars and os.path.exists(jar_file):
with zipfile.ZipFile(jar_file, "r") as zip_ref:
zip_ref.extractall(classes_dir)

(_classes_dir_snapshot,) = self.context._scheduler.capture_snapshots(
[PathGlobsAndRoot(PathGlobs([classes_dir + "/**"]), get_buildroot())]
)
return _classes_dir_snapshot

analysis_snapshot = _get_analysis_snapshot()
classes_dir_snapshot = _get_classes_dir_snapshot()
return self.context._scheduler.merge_directories(
[analysis_snapshot.digest, classes_dir_snapshot.digest]
)

def get_zinc_compiler_classpath(self):
"""Get the classpath for the zinc compiler JVM tool.
Expand Down
8 changes: 5 additions & 3 deletions src/python/pants/backend/python/rules/hermetic_pex.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,11 @@ def create_process(

hermetic_env = dict(
PATH=create_path_env_var(python_setup.interpreter_search_paths),
PEX_ROOT="./pex_root",
PEX_ROOT=".cache/pex_root",
RUNTIME_PEX_ROOT=".cache/pex_root",
PEX_INHERIT_PATH="false",
PEX_IGNORE_RCFILES="true",
**subprocess_encoding_environment.invocation_environment_dict
**subprocess_encoding_environment.invocation_environment_dict,
)
if env:
hermetic_env.update(env)
Expand All @@ -64,5 +65,6 @@ def create_process(
input_digest=input_digest,
description=description,
env=hermetic_env,
**kwargs
append_only_caches={"pex_root"},
**kwargs,
)
2 changes: 2 additions & 0 deletions src/python/pants/engine/internals/native.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,6 +961,7 @@ def new_scheduler(
build_root: str,
local_store_dir: str,
local_execution_root_dir: str,
named_caches_dir: str,
ignore_patterns: List[str],
use_gitignore: bool,
execution_options,
Expand Down Expand Up @@ -1014,6 +1015,7 @@ def ti(type_obj):
self.context.utf8_buf(build_root),
self.context.utf8_buf(local_store_dir),
self.context.utf8_buf(local_execution_root_dir),
self.context.utf8_buf(named_caches_dir),
self.context.utf8_buf_buf(ignore_patterns),
use_gitignore,
self.to_ids_buf(root_subject_types),
Expand Down
3 changes: 3 additions & 0 deletions src/python/pants/engine/internals/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def __init__(
build_root: str,
local_store_dir: str,
local_execution_root_dir: str,
named_caches_dir: str,
rules: Tuple[Rule, ...],
union_rules: Dict[Type, "OrderedSet[Type]"],
execution_options: ExecutionOptions,
Expand All @@ -109,6 +110,7 @@ def __init__(
:param work_dir: The pants work dir.
:param local_store_dir: The directory to use for storing the engine's LMDB store in.
:param local_execution_root_dir: The directory to use for local execution sandboxes.
:param named_caches_dir: The directory to use as the root for named mutable caches.
:param rules: A set of Rules which is used to compute values in the graph.
:param union_rules: A dict mapping union base types to member types so that rules can be written
against abstract union types without knowledge of downstream rulesets.
Expand All @@ -133,6 +135,7 @@ def __init__(
build_root=build_root,
local_store_dir=local_store_dir,
local_execution_root_dir=local_execution_root_dir,
named_caches_dir=named_caches_dir,
ignore_patterns=ignore_patterns,
use_gitignore=use_gitignore,
execution_options=execution_options,
Expand Down
2 changes: 2 additions & 0 deletions src/python/pants/engine/internals/scheduler_test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def mk_scheduler(
project_tree = project_tree or self.mk_fs_tree(work_dir=work_dir)
local_store_dir = os.path.realpath(safe_mkdtemp())
local_execution_root_dir = os.path.realpath(safe_mkdtemp())
named_caches_dir = os.path.realpath(safe_mkdtemp())
if execution_options is not None:
eo = asdict(DEFAULT_EXECUTION_OPTIONS)
eo.update(execution_options)
Expand All @@ -68,6 +69,7 @@ def mk_scheduler(
build_root=project_tree.build_root,
local_store_dir=local_store_dir,
local_execution_root_dir=local_execution_root_dir,
named_caches_dir=named_caches_dir,
rules=rules,
union_rules=union_rules,
execution_options=execution_options or DEFAULT_EXECUTION_OPTIONS,
Expand Down
12 changes: 4 additions & 8 deletions src/python/pants/engine/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import logging
from dataclasses import dataclass
from textwrap import dedent
from typing import Dict, Iterable, Optional, Tuple, Union
from typing import AbstractSet, Dict, Iterable, Optional, Tuple, Union

from pants.engine.fs import EMPTY_DIGEST, Digest
from pants.engine.platform import Platform, PlatformConstraint
Expand All @@ -23,17 +23,15 @@ class ProductDescription:
@frozen_after_init
@dataclass(unsafe_hash=True)
class Process:
# TODO: add a method to hack together a `process_executor` invocation command line which
# reproduces this process execution request to make debugging remote executions effortless!
argv: Tuple[str, ...]
description: str
input_digest: Digest
working_directory: Optional[str]
env: Tuple[str, ...]
append_only_caches: Tuple[str, ...]
output_files: Tuple[str, ...]
output_directories: Tuple[str, ...]
timeout_seconds: Union[int, float]
unsafe_local_only_files_because_we_favor_speed_over_correctness_for_this_rule: Digest
jdk_home: Optional[str]
is_nailgunnable: bool

Expand All @@ -45,10 +43,10 @@ def __init__(
input_digest: Digest = EMPTY_DIGEST,
working_directory: Optional[str] = None,
env: Optional[Dict[str, str]] = None,
append_only_caches: Optional[AbstractSet[str]] = None,
output_files: Optional[Iterable[str]] = None,
output_directories: Optional[Iterable[str]] = None,
timeout_seconds: Optional[Union[int, float]] = None,
unsafe_local_only_files_because_we_favor_speed_over_correctness_for_this_rule: Digest = EMPTY_DIGEST,
jdk_home: Optional[str] = None,
is_nailgunnable: bool = False,
) -> None:
Expand Down Expand Up @@ -80,13 +78,11 @@ def __init__(
self.input_digest = input_digest
self.working_directory = working_directory
self.env = tuple(itertools.chain.from_iterable((env or {}).items()))
self.append_only_caches = tuple(sorted(append_only_caches or set()))
self.output_files = tuple(output_files or ())
self.output_directories = tuple(output_directories or ())
# NB: A negative or None time value is normalized to -1 to ease the transfer to Rust.
self.timeout_seconds = timeout_seconds if timeout_seconds and timeout_seconds > 0 else -1
self.unsafe_local_only_files_because_we_favor_speed_over_correctness_for_this_rule = (
unsafe_local_only_files_because_we_favor_speed_over_correctness_for_this_rule
)
self.jdk_home = jdk_home
self.is_nailgunnable = is_nailgunnable

Expand Down
4 changes: 4 additions & 0 deletions src/python/pants/init/engine_initializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ def setup_legacy_graph(
use_gitignore,
bootstrap_options.local_store_dir,
bootstrap_options.local_execution_root_dir,
bootstrap_options.named_caches_dir,
bootstrap_options.build_file_prelude_globs,
options_bootstrapper,
build_configuration,
Expand All @@ -337,6 +338,7 @@ def setup_legacy_graph_extended(
use_gitignore: bool,
local_store_dir: str,
local_execution_root_dir: str,
named_caches_dir: str,
build_file_prelude_globs: Tuple[str, ...],
options_bootstrapper: OptionsBootstrapper,
build_configuration: BuildConfiguration,
Expand All @@ -353,6 +355,7 @@ def setup_legacy_graph_extended(
:param local_store_dir: The directory to use for storing the engine's LMDB store in.
:param local_execution_root_dir: The directory to use for local execution sandboxes.
:param named_caches_dir: The base directory for named cache storage.
:param build_file_prelude_globs: Globs to match files to be prepended to all BUILD files.
:param build_root: A path to be used as the build root. If None, then default is used.
:param native: An instance of the native-engine subsystem.
Expand Down Expand Up @@ -456,6 +459,7 @@ def build_root_singleton() -> BuildRoot:
build_root=build_root,
local_store_dir=local_store_dir,
local_execution_root_dir=local_execution_root_dir,
named_caches_dir=named_caches_dir,
rules=rules,
union_rules=union_rules,
execution_options=execution_options,
Expand Down
7 changes: 7 additions & 0 deletions src/python/pants/option/global_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,13 @@ def register_bootstrap_options(cls, register):
help="Directory to use for engine's local process execution sandboxing.",
default=tempfile.gettempdir(),
)
register(
"--named-caches-dir",
advanced=True,
help="Directory to use as the base for named global caches for processes with "
"trusted, concurrency-safe caches.",
default=os.path.join(get_pants_cachedir(), "named_caches"),
)
register(
"--remote-execution",
advanced=True,
Expand Down
3 changes: 2 additions & 1 deletion src/python/pants/testutil/engine/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,9 @@ def create_scheduler(rules, union_rules=None, validate=True, native=None):
ignore_patterns=[],
use_gitignore=False,
build_root=str(Path.cwd()),
local_store_dir="./.pants.d",
local_store_dir="./.pants.d/lmdb_store",
local_execution_root_dir="./.pants.d",
named_caches_dir="./.pants.d/named_caches",
rules=rules,
union_rules=union_rules,
execution_options=DEFAULT_EXECUTION_OPTIONS,
Expand Down
2 changes: 2 additions & 0 deletions src/python/pants/testutil/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ def _init_engine(self, local_store_dir: Optional[str] = None) -> None:
global_options = options_bootstrapper.bootstrap_options.for_global_scope()
local_store_dir = local_store_dir or global_options.local_store_dir
local_execution_root_dir = global_options.local_execution_root_dir
named_caches_dir = global_options.named_caches_dir

# NB: This uses the long form of initialization because it needs to directly specify
# `cls.alias_groups` rather than having them be provided by bootstrap options.
Expand All @@ -413,6 +414,7 @@ def _init_engine(self, local_store_dir: Optional[str] = None) -> None:
use_gitignore=False,
local_store_dir=local_store_dir,
local_execution_root_dir=local_execution_root_dir,
named_caches_dir=named_caches_dir,
build_file_prelude_globs=(),
glob_match_error_behavior=GlobMatchErrorBehavior.error,
native=init_native(),
Expand Down
1 change: 1 addition & 0 deletions src/rust/engine/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions src/rust/engine/engine_cffi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ pub extern "C" fn scheduler_create(
build_root_buf: Buffer,
local_store_dir_buf: Buffer,
local_execution_root_dir_buf: Buffer,
named_caches_dir_buf: Buffer,
ignore_patterns_buf: BufferBuffer,
use_gitignore: bool,
root_type_ids: TypeIdBuffer,
Expand Down Expand Up @@ -348,6 +349,7 @@ pub extern "C" fn scheduler_create(
build_root_buf,
local_store_dir_buf,
local_execution_root_dir_buf,
named_caches_dir_buf,
ignore_patterns_buf,
use_gitignore,
root_type_ids,
Expand Down Expand Up @@ -382,6 +384,7 @@ fn make_core(
build_root_buf: Buffer,
local_store_dir_buf: Buffer,
local_execution_root_dir_buf: Buffer,
named_caches_dir_buf: Buffer,
ignore_patterns_buf: BufferBuffer,
use_gitignore: bool,
root_type_ids: TypeIdBuffer,
Expand Down Expand Up @@ -483,6 +486,7 @@ fn make_core(
use_gitignore,
PathBuf::from(local_store_dir_buf.to_os_string()),
PathBuf::from(local_execution_root_dir_buf.to_os_string()),
PathBuf::from(named_caches_dir_buf.to_os_string()),
remote_execution,
remote_store_servers_vec,
if remote_execution_server_string.is_empty() {
Expand Down
32 changes: 9 additions & 23 deletions src/rust/engine/process_execution/src/cache_tests.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
use crate::{
CommandRunner as CommandRunnerTrait, Context, FallibleProcessResultWithPlatform,
PlatformConstraint, Process, ProcessMetadata,
CommandRunner as CommandRunnerTrait, Context, FallibleProcessResultWithPlatform, NamedCaches,
Process, ProcessMetadata,
};
use hashing::EMPTY_DIGEST;
use sharded_lmdb::ShardedLmdb;
use std::collections::{BTreeMap, BTreeSet};
use std::io::Write;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use store::Store;
use tempfile::TempDir;
use testutil::data::TestData;
Expand All @@ -22,12 +19,14 @@ struct RoundtripResults {
async fn run_roundtrip(script_exit_code: i8) -> RoundtripResults {
let runtime = task_executor::Executor::new(Handle::current());
let work_dir = TempDir::new().unwrap();
let named_cache_dir = TempDir::new().unwrap();
let store_dir = TempDir::new().unwrap();
let store = Store::local_only(runtime.clone(), store_dir.path()).unwrap();
let local = crate::local::CommandRunner::new(
store.clone(),
runtime.clone(),
work_dir.path().to_owned(),
NamedCaches::new(named_cache_dir.path().to_owned()),
true,
);

Expand All @@ -44,24 +43,11 @@ async fn run_roundtrip(script_exit_code: i8) -> RoundtripResults {
})
.unwrap();

let request = Process {
argv: vec![
testutil::path::find_bash(),
format!("{}", script_path.display()),
],
env: BTreeMap::new(),
working_directory: None,
input_files: EMPTY_DIGEST,
output_files: vec![PathBuf::from("roland")].into_iter().collect(),
output_directories: BTreeSet::new(),
timeout: Some(Duration::from_millis(1000)),
description: "bash".to_string(),
unsafe_local_only_files_because_we_favor_speed_over_correctness_for_this_rule:
hashing::EMPTY_DIGEST,
jdk_home: None,
target_platform: PlatformConstraint::None,
is_nailgunnable: false,
};
let request = Process::new(vec![
testutil::path::find_bash(),
format!("{}", script_path.display()),
])
.output_files(vec![PathBuf::from("roland")].into_iter().collect());

let local_result = local.run(request.clone().into(), Context::default()).await;

Expand Down
Loading

0 comments on commit bbf36f6

Please sign in to comment.