Skip to content

Commit

Permalink
[engine] Rust IO (pantsbuild#4265)
Browse files Browse the repository at this point in the history
### Problem

Porting the scheduling of tasks to rust helped to minimize its overhead, but we were still left with a huge number of python objects and the single-core bottleneck of python for the filesystem leaf nodes in the execution graph (the majority).

### Solution

pantsbuild#4221 and pantsbuild#4261 laid the groundwork for executing concurrent IO on the rust side, and this patch enables that. `fs.py` was ported to `fs.rs` with relatively few changes (the largest of which is probably the use of `Future`s).

One significant difference though, is that the IO API in the engine has raised away from low level filesystem operations toward a higher level abstraction. The most fundamental operation exposed to users is now "get me a `Snapshot` for these `PathGlobs`"... ie, "get me a fingerprinted tarball of the files matching these globs". This change had two goals: 1) to make the API boundary between rust and python thin, 2) to prepare for remote execution of tasks by capturing inputs into atomic shippable units. 

### Result

We're able to get multiple cores involved while executing `./pants list ::`, and we're capturing shippable snapshots. Unfortunately, due to pantsbuild#4298, the speedup is not as significant as it should be: will work to fix that issue over the next two weeks.
  • Loading branch information
Stu Hood authored Mar 2, 2017
1 parent cbdb975 commit 53c3e30
Show file tree
Hide file tree
Showing 53 changed files with 2,614 additions and 1,398 deletions.
5 changes: 3 additions & 2 deletions src/python/pants/base/cmd_line_spec_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,5 +68,6 @@ def parse_spec(self, spec):
return SiblingAddresses(self._normalize_spec_path(spec_path))
else:
spec_parts = spec.rsplit(':', 1)
return SingleAddress(self._normalize_spec_path(spec_parts[0]),
spec_parts[1] if len(spec_parts) > 1 else None)
spec_path = self._normalize_spec_path(spec_parts[0])
name = spec_parts[1] if len(spec_parts) > 1 else os.path.basename(spec_path)
return SingleAddress(spec_path, name)
15 changes: 8 additions & 7 deletions src/python/pants/base/project_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def __init__(self, build_root, ignore_patterns=None):
'ProjectTree build_root {} must be an absolute path.'.format(build_root))
self.build_root = os.path.realpath(build_root)
logger.debug('ProjectTree ignore_patterns: %s', ignore_patterns)
self.ignore = PathSpec.from_lines(GitWildMatchPattern, ignore_patterns if ignore_patterns else [])
self.ignore_patterns = ignore_patterns if ignore_patterns else []
self.ignore = PathSpec.from_lines(GitWildMatchPattern, self.ignore_patterns)

@abstractmethod
def _glob1_raw(self, dir_relpath, glob):
Expand Down Expand Up @@ -140,7 +141,7 @@ def walk(self, relpath, topdown=True):
matched_dirs = self.ignore.match_files([os.path.join(root, "{}/".format(d)) for d in dirs])
matched_files = self.ignore.match_files([os.path.join(root, f) for f in files])
for matched_dir in matched_dirs:
dirs.remove(fast_relpath(matched_dir, root).rstrip('/'))
dirs.remove(fast_relpath(matched_dir, root).rstrip(b'/'))

for matched_file in matched_files:
files.remove(fast_relpath(matched_file, root))
Expand Down Expand Up @@ -180,15 +181,15 @@ def _filter_ignored(self, entries, selector=None):
return [entry for path, entry in prefixed_entries if path not in ignored_paths]

def _relpath_no_dot(self, relpath):
return relpath.lstrip('./') if relpath != '.' else ''
return relpath.lstrip(b'./') if relpath != b'.' else b''

def _raise_access_ignored(self, relpath):
"""Raises exception when accessing ignored path."""
raise self.AccessIgnoredPathError('The path {} is ignored in {}'.format(relpath, self))

def _append_trailing_slash(self, relpath):
"""Add a trailing slash if not already has one."""
return relpath if relpath.endswith('/') or len(relpath) == 0 else relpath + '/'
return relpath if relpath.endswith(b'/') or len(relpath) == 0 else relpath + b'/'

def _append_slash_if_dir_path(self, relpath):
"""For a dir path return a path that has a trailing slash."""
Expand All @@ -214,18 +215,18 @@ class File(datatype('File', ['path']), Stat):
"""A file."""

def __new__(cls, path):
return super(File, cls).__new__(cls, six.text_type(path))
return super(File, cls).__new__(cls, six.binary_type(path))


class Dir(datatype('Dir', ['path']), Stat):
"""A directory."""

def __new__(cls, path):
return super(Dir, cls).__new__(cls, six.text_type(path))
return super(Dir, cls).__new__(cls, six.binary_type(path))


class Link(datatype('Link', ['path']), Stat):
"""A symbolic link."""

def __new__(cls, path):
return super(Link, cls).__new__(cls, six.text_type(path))
return super(Link, cls).__new__(cls, six.binary_type(path))
3 changes: 3 additions & 0 deletions src/python/pants/base/scm_project_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import os
from types import NoneType

from pants.base.deprecated import deprecated
from pants.base.project_tree import Dir, File, Link, ProjectTree
from pants.util.dirutil import fast_relpath
from pants.util.memo import memoized
Expand All @@ -19,6 +20,8 @@


class ScmProjectTree(ProjectTree):
@deprecated('1.5.0.dev0',
hint_message="ScmProjectTree was lightly used, and is now deprecated.")
def __init__(self, build_root, scm, rev, ignore_patterns=None):
super(ScmProjectTree, self).__init__(build_root, ignore_patterns)
self._scm = scm
Expand Down
15 changes: 7 additions & 8 deletions src/python/pants/base/specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,16 @@ def to_spec_string(self):


class SingleAddress(datatype('SingleAddress', ['directory', 'name']), Spec):
"""A Spec for a single address, with an optional name.
"""A Spec for a single address."""

If the address name is None, then the default address for the directory is assumed...
ie, the address with the same name as the directory.
"""
def __new__(cls, directory, name):
if directory is None or name is None:
raise ValueError('A SingleAddress must have both a directory and name. Got: '
'{}:{}'.format(directory, name))
return super(SingleAddress, cls).__new__(cls, directory, name)

def to_spec_string(self):
if self.name:
return '{}:{}'.format(self.directory, self.name)
else:
return self.directory
return '{}:{}'.format(self.directory, self.name)


class SiblingAddresses(datatype('SiblingAddresses', ['directory']), Spec):
Expand Down
4 changes: 4 additions & 0 deletions src/python/pants/bin/daemon_pants_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ def post_fork_child(self):
# Clean global state.
clean_global_runtime_state(reset_subsystem=True)

# Reinitialize scheduler threads.
if self._graph_helper:
self._graph_helper.scheduler.post_fork()

# Re-raise any deferred exceptions, if present.
self._raise_deferred_exc()

Expand Down
2 changes: 1 addition & 1 deletion src/python/pants/bin/engine_initializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def setup_legacy_graph(pants_ignore_patterns,
# LegacyBuildGraph will explicitly request the products it needs.
tasks = (
create_legacy_graph_tasks(symbol_table_cls) +
create_fs_tasks() +
create_fs_tasks(project_tree) +
create_graph_tasks(address_mapper, symbol_table_cls)
)

Expand Down
3 changes: 1 addition & 2 deletions src/python/pants/build_graph/address_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ def is_valid_single_address(self, single_address):
single_address, type(single_address), SingleAddress))

try:
self.scan_specs([single_address])
return True
return bool(self.scan_specs([single_address]))
except AddressLookupError:
return False

Expand Down
149 changes: 66 additions & 83 deletions src/python/pants/engine/build_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,16 @@
unicode_literals, with_statement)

import collections
from fnmatch import fnmatch
from os.path import basename, dirname, join
from os.path import dirname, join

import six

from pants.base.project_tree import Dir, File
from pants.base.project_tree import Dir
from pants.base.specs import (AscendantAddresses, DescendantAddresses, SiblingAddresses,
SingleAddress)
from pants.build_graph.address import Address, BuildFileAddress
from pants.engine.addressable import AddressableDescriptor, Addresses, Exactly, TypeConstraintError
from pants.engine.fs import DirectoryListing, Files, FilesContent, Path, PathGlobs
from pants.engine.fs import FilesContent, PathGlobs, Snapshot
from pants.engine.mapper import AddressFamily, AddressMap, AddressMapper, ResolveError
from pants.engine.objects import Locatable, SerializableFactory, Validatable
from pants.engine.selectors import Select, SelectDependencies, SelectLiteral, SelectProjection
Expand All @@ -37,35 +36,33 @@ class BuildDirs(datatype('BuildDirs', ['dependencies'])):
"""A list of Stat objects for directories containing build files."""


class BuildFiles(datatype('BuildFiles', ['files'])):
"""A list of Paths that are known to match a build file pattern."""
class BuildFiles(datatype('BuildFiles', ['files_content'])):
"""The FileContents of BUILD files in some directory"""


def filter_buildfile_paths(address_mapper, directory_listing):
if not directory_listing.exists:
raise ResolveError('Directory "{}" does not exist.'.format(directory_listing.directory.path))
class BuildFileGlobs(datatype('BuildFilesGlobs', ['path_globs'])):
"""A wrapper around PathGlobs that are known to match a build file pattern."""

def match(stat):
# Short circuit for ignored paths.
if address_mapper.build_ignore_patterns.match_file(stat.path):
return False

return (type(stat) is File and any(fnmatch(basename(stat.path), pattern)
for pattern in address_mapper.build_patterns))
build_files = tuple(Path(stat.path, stat)
for stat in directory_listing.dependencies if match(stat))
return BuildFiles(build_files)
def buildfile_path_globs_for_dir(address_mapper, directory):
patterns = address_mapper.build_patterns
return BuildFileGlobs(PathGlobs.create(directory.path, include=patterns, exclude=()))


def parse_address_family(address_mapper, path, build_files_content):
def parse_address_family(address_mapper, path, build_files):
"""Given the contents of the build files in one directory, return an AddressFamily.
The AddressFamily may be empty, but it will not be None.
"""
if not build_files_content.dependencies:
files_content = build_files.files_content.dependencies
if not files_content:
raise ResolveError('Directory "{}" does not contain build files.'.format(path))
address_maps = []
for filecontent_product in build_files_content.dependencies:
paths = (f.path for f in files_content)
ignored_paths = set(address_mapper.build_ignore_patterns.match_files(paths))
for filecontent_product in files_content:
if filecontent_product.path in ignored_paths:
continue
address_maps.append(AddressMap.parse(filecontent_product.path,
filecontent_product.content,
address_mapper.symbol_table_cls,
Expand Down Expand Up @@ -212,44 +209,45 @@ def identity(v):
return v


def address_from_address_family(address_family, single_address):
"""Given an AddressFamily and a SingleAddress, return an Addresses object containing the Address.
Raises an exception if the SingleAddress does not match an existing Address.
"""
name = single_address.name
if name is None:
name = basename(single_address.directory)
if name not in address_family.objects_by_name:
_raise_did_you_mean(address_family, single_address.name)
return Addresses(tuple([Address(address_family.namespace, name)]))


def addresses_from_address_family(address_family):
"""Given an AddressFamily, return an Addresses objects containing all of its `addressables`."""
return Addresses(tuple(address_family.addressables.keys()))
def addresses_from_address_families(address_families, spec):
"""Given a list of AddressFamilies and a Spec, return matching Addresses."""
if type(spec) in (DescendantAddresses, SiblingAddresses, AscendantAddresses):
addresses = tuple(a for af in address_families for a in af.addressables.keys())
elif type(spec) is SingleAddress:
addresses = tuple(a
for af in address_families
for a in af.addressables.keys() if a.target_name == spec.name)
else:
raise ValueError('Unrecognized Spec type: {}'.format(spec))
return Addresses(addresses)


def addresses_from_address_families(address_families):
"""Given a list of AddressFamilies, return an Addresses object containing all addressables."""
return Addresses(tuple(a for af in address_families for a in af.addressables.keys()))


def filter_build_dirs(address_mapper, build_files):
"""Given Files matching a build pattern, return their parent directories as BuildDirs."""
dirnames = set(dirname(f.stat.path) for f in build_files.dependencies)
def filter_build_dirs(address_mapper, snapshot):
"""Given a Snapshot matching a build pattern, return parent directories as BuildDirs."""
dirnames = set(dirname(f.stat.path) for f in snapshot.files)
ignored_dirnames = address_mapper.build_ignore_patterns.match_files('{}/'.format(dirname) for dirname in dirnames)
ignored_dirnames = set(d.rstrip('/') for d in ignored_dirnames)
return BuildDirs(tuple(Dir(d) for d in dirnames if d not in ignored_dirnames))


def descendant_addresses_to_globs(address_mapper, descendant_addresses):
"""Given a DescendantAddresses object, return a PathGlobs object for matching build files.
This allows us to limit our AddressFamily requests to directories that contain build files.
"""
patterns = [join('**', pattern) for pattern in address_mapper.build_patterns]
return PathGlobs.create_from_specs(descendant_addresses.directory, patterns)
def spec_to_globs(address_mapper, spec):
"""Given a Spec object, return a PathGlobs object for the build files that it matches."""
if type(spec) is DescendantAddresses:
directory = spec.directory
patterns = [join('**', pattern) for pattern in address_mapper.build_patterns]
elif type(spec) in (SiblingAddresses, SingleAddress):
directory = spec.directory
patterns = address_mapper.build_patterns
elif type(spec) is AscendantAddresses:
directory = ''
patterns = [
join(f, pattern)
for pattern in address_mapper.build_patterns
for f in _recursive_dirname(spec.directory)
]
else:
raise ValueError('Unrecognized Spec type: {}'.format(spec))
return PathGlobs.create(directory, include=patterns, exclude=[])


def _recursive_dirname(f):
Expand All @@ -267,23 +265,17 @@ def _recursive_dirname(f):
yield ''


def ascendant_addresses_to_globs(address_mapper, ascendant_addresses):
"""Given an AscendantAddresses object, return a PathGlobs object for matching build files."""
patterns = [
join(f, pattern)
for pattern in address_mapper.build_patterns
for f in _recursive_dirname(ascendant_addresses.directory)
]
return PathGlobs.create_from_specs('', patterns)


def create_graph_tasks(address_mapper, symbol_table_cls):
"""Creates tasks used to parse Structs from BUILD files.
:param address_mapper_key: The subject key for an AddressMapper instance.
:param symbol_table_cls: A SymbolTable class to provide symbols for Address lookups.
"""
symbol_table_constraint = symbol_table_cls.constraint()
specs_constraint = Exactly(SingleAddress,
SiblingAddresses,
DescendantAddresses,
AscendantAddresses)
return [
# Support for resolving Structs from Addresses
(symbol_table_constraint,
Expand All @@ -300,37 +292,28 @@ def create_graph_tasks(address_mapper, symbol_table_cls):
(AddressFamily,
[SelectLiteral(address_mapper, AddressMapper),
Select(Dir),
SelectProjection(FilesContent, Files, ('files',), BuildFiles)],
Select(BuildFiles)],
parse_address_family),
(BuildFiles,
[SelectProjection(FilesContent, PathGlobs, ('path_globs',), BuildFileGlobs)],
BuildFiles),
(BuildFileGlobs,
[SelectLiteral(address_mapper, AddressMapper),
Select(DirectoryListing)],
filter_buildfile_paths),
] + [
# Simple spec handling.
(Addresses,
[SelectProjection(AddressFamily, Dir, ('directory',), SingleAddress),
Select(SingleAddress)],
address_from_address_family),
(Addresses,
[SelectProjection(AddressFamily, Dir, ('directory',), SiblingAddresses)],
addresses_from_address_family),
Select(Dir)],
buildfile_path_globs_for_dir),
] + [
# Recursive spec handling: locate directories that contain build files, and request
# Spec handling: locate directories that contain build files, and request
# AddressFamilies for each of them.
(Addresses,
[SelectDependencies(AddressFamily, BuildDirs, field_types=(Dir,))],
[SelectDependencies(AddressFamily, BuildDirs, field_types=(Dir,)),
Select(specs_constraint)],
addresses_from_address_families),
(BuildDirs,
[SelectLiteral(address_mapper, AddressMapper),
Select(Files)],
Select(Snapshot)],
filter_build_dirs),
(PathGlobs,
[SelectLiteral(address_mapper, AddressMapper),
Select(DescendantAddresses)],
descendant_addresses_to_globs),
(PathGlobs,
[SelectLiteral(address_mapper, AddressMapper),
Select(AscendantAddresses)],
ascendant_addresses_to_globs),
Select(specs_constraint)],
spec_to_globs),
]
Loading

0 comments on commit 53c3e30

Please sign in to comment.