Extract Python dependencies in an intrinsic (pantsbuild#18854)

This PR does two things: - Introduces a new crate to the engine: `dep_inference`. A module inside is dedicated to Python, and leverages `tree-sitter` and `tree-sitter-python` to parse Parse dependencies. `tree-sitter` was chosen because it supports Py2/3, supports other languages, and also is syntax-error-resistant. - Leverages the new crate in an intrinsic. The new behavior is forced opt-in/out and will eventually be the "only" way to do the inference. # TImings Helper script: ```shell #!/bin/bash # Replace some random numbers find src/python/pants -type f -name "*.py" -not -name "__init__.py" | xargs sed -i s/'Copyright [0123456789][0123456789][0123456789][0123456789]'/"Copyright $RANDOM"/ # Wait for the kernel really quick sleep 1 # Wait for the inotify notifications to stop while true; do mtime=$(stat -c %Y .pants.d/pants.log) now=$(date +%s) diff=$((now - mtime)) if (( diff >= 5 )); then break fi sleep $((5 - diff)) done ``` Timings follows. `./dirty_files.sh` runs test worst case scenario ( touch every copyright header). I'm on a 64 core machine, so I run as if we only had 8 cores. Findings: - In the worst case (the extraction process is not in the process cache) we blow it out of the water in terms of time saved - In the best case (the process cache is hot) we're comparable. Put another way, the time it takes to execute the rule code and lookup the process in the process cache is roughly the amount of time it takes just to parse it again ## Worst case (completely cold cache) ``` $ hyperfine --prepare ./dirty_files.sh --runs 4 --warmup 1 'pants --rule-threads-core=4 --process-execution-local-parallelism=8 --no-python-infer-use-rust-parser --filter-target-type=python_source dependencies ::' 'pants --rule-threads-core=4 --process-execution-local-parallelism=8 --python-infer-use-rust-parser --filter-target-type=python_source dependencies ::' Benchmark 1: pants --rule-threads-core=4 --process-execution-local-parallelism=8 --no-python-infer-use-rust-parser --filter-target-type=python_source dependencies :: Time (mean ± σ): 36.335 s ± 1.286 s [User: 0.754 s, System: 0.151 s] Range (min … max): 34.698 s … 37.645 s 4 runs Benchmark 2: pants --rule-threads-core=4 --process-execution-local-parallelism=8 --python-infer-use-rust-parser --filter-target-type=python_source dependencies :: Time (mean ± σ): 2.899 s ± 0.096 s [User: 0.758 s, System: 0.131 s] Range (min … max): 2.764 s … 2.990 s 4 runs Summary 'pants --rule-threads-core=4 --process-execution-local-parallelism=8 --python-infer-use-rust-parser --filter-target-type=python_source dependencies ::' ran 12.54 ± 0.61 times faster than 'pants --rule-threads-core=4 --process-execution-local-parallelism=8 --no-python-infer-use-rust-parser --filter-target-type=python_source dependencies ::' ``` ## Best Case (hot cache, but no daemon) ``` $ hyperfine --runs 4 --warmup 1 'pants --no-pantsd --rule-threads-core=4 --process-execution-local-parallelism=8 --no-python-infer-use-rust-parser --filter-target-type=python_source dependencies ::' 'pants --no-pantsd --rule-threads-core=4 --process-execution-local-parallelism=8 --python-infer-use-rust-parser --filter-target-type=python_source dependencies ::' Benchmark 1: pants --no-pantsd --rule-threads-core=4 --process-execution-local-parallelism=8 --no-python-infer-use-rust-parser --filter-target-type=python_source dependencies :: Time (mean ± σ): 20.589 s ± 0.319 s [User: 20.303 s, System: 2.002 s] Range (min … max): 20.167 s … 20.934 s 4 runs Benchmark 2: pants --no-pantsd --rule-threads-core=4 --process-execution-local-parallelism=8 --python-infer-use-rust-parser --filter-target-type=python_source dependencies :: Time (mean ± σ): 19.273 s ± 0.347 s [User: 18.881 s, System: 1.669 s] Range (min … max): 18.940 s … 19.759 s 4 runs Summary 'pants --no-pantsd --rule-threads-core=4 --process-execution-local-parallelism=8 --python-infer-use-rust-parser --filter-target-type=python_source dependencies ::' ran 1.07 ± 0.03 times faster than 'pants --no-pantsd --rule-threads-core=4 --process-execution-local-parallelism=8 --no-python-infer-use-rust-parser --filter-target-type=python_source dependencies ::' ```
pp-gborodin · May 9, 2023 · 3a20af9 · 3a20af9
1 parent 268c9f8
commit 3a20af9
Show file tree

Hide file tree

Showing 18 changed files with 1,301 additions and 60 deletions.
diff --git a/pants.toml b/pants.toml
@@ -160,6 +160,7 @@ python-default = [">=3.7,<3.10"]
 [python-infer]
 assets = true
 unowned_dependency_behavior = "error"
+use_rust_parser = false
 
 [docformatter]
 args = ["--wrap-summaries=100", "--wrap-descriptions=100"]

diff --git a/src/python/pants/backend/python/dependency_inference/parse_python_dependencies.py b/src/python/pants/backend/python/dependency_inference/parse_python_dependencies.py
@@ -1,7 +1,9 @@
 # Copyright 2020 Pants project contributors (see CONTRIBUTORS.md).
 # Licensed under the Apache License, Version 2.0 (see LICENSE).
+from __future__ import annotations
 
 import json
+import logging
 import os
 from dataclasses import dataclass
 from typing import Iterable
@@ -10,11 +12,13 @@
 from pants.backend.python.target_types import PythonSourceField
 from pants.backend.python.util_rules.interpreter_constraints import InterpreterConstraints
 from pants.backend.python.util_rules.pex_environment import PythonExecutable
+from pants.base.deprecated import warn_or_error
 from pants.core.util_rules.source_files import SourceFilesRequest
 from pants.core.util_rules.stripped_source_files import StrippedSourceFiles
 from pants.engine.collection import DeduplicatedCollection
 from pants.engine.environment import EnvironmentName
 from pants.engine.fs import CreateDigest, Digest, FileContent, MergeDigests
+from pants.engine.internals.native_dep_inference import NativeParsedPythonDependencies
 from pants.engine.process import Process, ProcessResult
 from pants.engine.rules import Get, MultiGet, collect_rules, rule
 from pants.engine.unions import UnionMembership, UnionRule, union
@@ -23,8 +27,10 @@
 from pants.util.resources import read_resource
 from pants.util.strutil import softwrap
 
+logger = logging.getLogger(__name__)
 
-@dataclass(frozen=True)
+
+@dataclass(frozen=True, order=True)
 class ParsedPythonImportInfo:
     lineno: int
     # An import is considered "weak" if we're unsure if a dependency will exist between the parsed
@@ -166,22 +172,61 @@ async def general_parser_script(
     )
 
 
-@rule
+@rule(level=LogLevel.DEBUG)
 async def parse_python_dependencies(
     request: ParsePythonDependenciesRequest,
     parser_script: ParserScript,
+    union_membership: UnionMembership,
+    python_infer_subsystem: PythonInferSubsystem,
 ) -> ParsedPythonDependencies:
-    python_interpreter, stripped_sources = await MultiGet(
-        Get(PythonExecutable, InterpreterConstraints, request.interpreter_constraints),
-        Get(StrippedSourceFiles, SourceFilesRequest([request.source])),
-    )
-
+    stripped_sources = await Get(StrippedSourceFiles, SourceFilesRequest([request.source]))
     # We operate on PythonSourceField, which should be one file.
     assert len(stripped_sources.snapshot.files) == 1
+
+    if python_infer_subsystem.options.is_default("use_rust_parser"):
+        # NB: In 2.18, we'll switch the default to `True` and then warn if the value is set (to anything)
+        # NB: In 2.19, we remove the option altogether and remove the old code.
+        warn_or_error(
+            removal_version="2.18.0.dev0",
+            entity="Not explicitly providing [python-infer].use_rust_parser",
+            hint="Read the help for [python-infer].use_rust_parser, then set the value in pants.toml.",
+        )
+
+    has_custom_dep_inferences = len(union_membership[PythonDependencyVisitorRequest]) > 1
+    if python_infer_subsystem.use_rust_parser and not has_custom_dep_inferences:
+        native_result = await Get(
+            NativeParsedPythonDependencies, Digest, stripped_sources.snapshot.digest
+        )
+        imports = dict(native_result.imports)
+        assets = set()
+
+        if python_infer_subsystem.string_imports or python_infer_subsystem.assets:
+            for string, line in native_result.string_candidates.items():
+                slash_count = string.count("/")
+                if (
+                    python_infer_subsystem.string_imports
+                    and not slash_count
+                    and string.count(".") >= python_infer_subsystem.string_imports_min_dots
+                ):
+                    imports.setdefault(string, (line, True))
+                if (
+                    python_infer_subsystem.assets
+                    and slash_count >= python_infer_subsystem.assets_min_slashes
+                ):
+                    assets.add(string)
+
+        return ParsedPythonDependencies(
+            ParsedPythonImports(
+                (key, ParsedPythonImportInfo(*value)) for key, value in imports.items()
+            ),
+            ParsedPythonAssetPaths(sorted(assets)),
+        )
+
     file = stripped_sources.snapshot.files[0]
 
-    input_digest = await Get(
-        Digest, MergeDigests([parser_script.digest, stripped_sources.snapshot.digest])
+    python_interpreter, input_digest = await MultiGet(
+        Get(PythonExecutable, InterpreterConstraints, request.interpreter_constraints),
+        Get(Digest, MergeDigests([parser_script.digest, stripped_sources.snapshot.digest])),
     )
     process_result = await Get(
         ProcessResult,

diff --git a/src/python/pants/backend/python/dependency_inference/parse_python_dependencies_test.py b/src/python/pants/backend/python/dependency_inference/parse_python_dependencies_test.py
@@ -64,6 +64,7 @@ def assert_deps_parsed(
             f"--python-infer-string-imports-min-dots={string_imports_min_dots}",
             f"--python-infer-assets={assets}",
             f"--python-infer-assets-min-slashes={assets_min_slashes}",
+            "--python-infer-use-rust-parser",
         ],
         env_inherit={"PATH", "PYENV_ROOT", "HOME"},
     )
@@ -109,11 +110,11 @@ def test_normal_imports(rule_runner: RuleRunner) -> None:
             ignored1 as alias1,  # pants: no-infer-dep
             ignored2 as \\
                 alias2,  # pants: no-infer-dep
-            ignored3 as  # pants: no-infer-dep
-                alias3,
-            ignored4 as alias4, ignored4,  # pants: no-infer-dep
-            not_ignored2, \\
-            not_ignored3
+            ignored3 as
+                alias3,  # pants: no-infer-dep
+            not_ignored2 as alias4, ignored3,  # pants: no-infer-dep
+            not_ignored3, \\
+            not_ignored4
         )
         from multiline_import2 import (ignored1,  # pants: no-infer-dep
             not_ignored)
@@ -126,7 +127,6 @@ def test_normal_imports(rule_runner: RuleRunner) -> None:
         rule_runner,
         content,
         expected_imports={
-            "__future__.print_function": ImpInfo(lineno=1, weak=False),
             "os": ImpInfo(lineno=3, weak=False),
             "os.path": ImpInfo(lineno=5, weak=False),
             "typing.TYPE_CHECKING": ImpInfo(lineno=6, weak=False),
@@ -135,8 +135,9 @@ def test_normal_imports(rule_runner: RuleRunner) -> None:
             "project.demo.Demo": ImpInfo(lineno=11, weak=False),
             "project.demo.OriginalName": ImpInfo(lineno=12, weak=False),
             "multiline_import1.not_ignored1": ImpInfo(lineno=16, weak=False),
-            "multiline_import1.not_ignored2": ImpInfo(lineno=23, weak=False),
-            "multiline_import1.not_ignored3": ImpInfo(lineno=24, weak=False),
+            "multiline_import1.not_ignored2": ImpInfo(lineno=22, weak=False),
+            "multiline_import1.not_ignored3": ImpInfo(lineno=23, weak=False),
+            "multiline_import1.not_ignored4": ImpInfo(lineno=24, weak=False),
             "multiline_import2.not_ignored": ImpInfo(lineno=27, weak=False),
             "project.circular_dep.CircularDep": ImpInfo(lineno=30, weak=False),
         },
@@ -149,16 +150,16 @@ def test_dunder_import_call(rule_runner: RuleRunner) -> None:
         __import__("pkg_resources")
         __import__("dunder_import_ignored")  # pants: no-infer-dep
         __import__(  # pants: no-infer-dep
-            "not_ignored_but_looks_like_it_could_be"
+            "ignored"
         )
         __import__(
-            "ignored"  # pants: no-infer-dep
+            "also_ignored"  # pants: no-infer-dep
         )
         __import__(
-            "also_not_ignored_but_looks_like_it_could_be"
+            "also_also_ignored"
         )  # pants: no-infer-dep
         __import__(
-            "ignored_too" \\
+            "not_ignored" \\
             # pants: no-infer-dep
         )
         __import__(
@@ -171,8 +172,7 @@ def test_dunder_import_call(rule_runner: RuleRunner) -> None:
         content,
         expected_imports={
             "pkg_resources": ImpInfo(lineno=1, weak=False),
-            "not_ignored_but_looks_like_it_could_be": ImpInfo(lineno=4, weak=False),
-            "also_not_ignored_but_looks_like_it_could_be": ImpInfo(lineno=10, weak=False),
+            "not_ignored": ImpInfo(lineno=13, weak=False),
         },
     )
 
@@ -299,15 +299,19 @@ def test_imports_from_strings(rule_runner: RuleRunner, min_dots: int) -> None:
             'a.b2.c.D',
             'a.b.c_狗',
 
-            # Definitely invalid strings
+            # Invalid module names, but we don't check that hard
             '..a.b.c.d',
-            'a.B.d',
             'a.2b.d',
             'a..b..c',
             'a.b.c.d.2Bar',
-            'a.b_c.D.bar',
-            'a.b_c.D.Bar',
             'a.2b.c.D',
+
+            # Definitely invalid strings
+            'I/have/a/slash',
+            'I\\\\have\\\\backslashes',
+            'I have whitespace',
+            '\\ttabby',
+            '\\nnewliney',
         ]
 
         for module in modules:
@@ -327,13 +331,24 @@ def test_imports_from_strings(rule_runner: RuleRunner, min_dots: int) -> None:
         "a.b_c.d._bar": ImpInfo(lineno=11, weak=True),
         "a.b2.c.D": ImpInfo(lineno=12, weak=True),
         "a.b.c_狗": ImpInfo(lineno=13, weak=True),
+        "..a.b.c.d": ImpInfo(lineno=16, weak=True),
+        "a.2b.d": ImpInfo(lineno=17, weak=True),
+        "a..b..c": ImpInfo(lineno=18, weak=True),
+        "a.b.c.d.2Bar": ImpInfo(lineno=19, weak=True),
+        "a.2b.c.D": ImpInfo(lineno=20, weak=True),
     }
     expected = {sym: info for sym, info in potentially_valid.items() if sym.count(".") >= min_dots}
 
     assert_deps_parsed(
-        rule_runner, content, expected_imports=expected, string_imports_min_dots=min_dots
+        rule_runner,
+        content,
+        expected_imports=expected,
+        string_imports_min_dots=min_dots,
+        assets=False,
+    )
+    assert_deps_parsed(
+        rule_runner, content, string_imports=False, expected_imports={}, assets=False
     )
-    assert_deps_parsed(rule_runner, content, string_imports=False, expected_imports={})
 
 
 def test_real_import_beats_string_import(rule_runner: RuleRunner) -> None:
@@ -501,6 +516,7 @@ def test_works_with_python39(rule_runner: RuleRunner) -> None:
             "treat.as.a.regular.import.not.a.string.import": ImpInfo(lineno=11, weak=False),
             "dep.from.str": ImpInfo(lineno=13, weak=True),
         },
+        expected_assets=["/dev/null"],
     )
 
 
@@ -518,6 +534,9 @@ def test_assets(rule_runner: RuleRunner, min_slashes: int) -> None:
             'data/subdir1/subdir2/a.json',
             'data/subdir1/subdir2/subdir3/a.json',
             '狗/狗.狗',
+            'data/a.b/c.d',
+            'data/extensionless',
+            'a/........',
 
             # Looks weird, but Unix and pathlib treat repeated "/" as one slash.
             # Our parsing, however considers this as multiple slashes.
@@ -526,13 +545,12 @@ def test_assets(rule_runner: RuleRunner, min_slashes: int) -> None:
 
             # Probably invalid assets.
             'noslashes',
-            'data/database',  # Unfortunately, extenionless files don't get matched.
 
             # Definitely invalid assets.
-            'a/........',
+            'I have whitespace',
+            '\\ttabby\\ttabby',
             '\\n/foo.json',
-            'data/a.b/c.d',
-            'windows\\style.txt',
+            'windows\\\\style.txt',
         ]
         """
     )
@@ -546,6 +564,9 @@ def test_assets(rule_runner: RuleRunner, min_slashes: int) -> None:
         "data/subdir1/subdir2/a.json",
         "data/subdir1/subdir2/subdir3/a.json",
         "狗/狗.狗",
+        "data/a.b/c.d",
+        "data/extensionless",
+        "a/........",
         "//foo.bar",
         "//foo/////bar.txt",
     }

diff --git a/src/python/pants/backend/python/dependency_inference/subsystem.py b/src/python/pants/backend/python/dependency_inference/subsystem.py
@@ -190,3 +190,31 @@ class PythonInferSubsystem(Subsystem):
         """
         ),
     )
+
+    use_rust_parser = BoolOption(
+        default=False,
+        help=softwrap(
+            f"""
+            Use the new Rust-based, multithreaded, in-process dependency parser.
+
+            Pants 2.17 introduced a new paradigm to dependency parsing for Python by leveraging a
+            Rust-based parser that's called in the same process as Pants itself, instead of farming
+            out to one-python-process-per-file.
+
+            As a result of the switch, cold-cache performance improved by a factor of about 12x,
+            while hot-cache had no difference. Additionally, Pants can now infer dependencies from
+            Python scripts with syntax errors.
+
+            However, since this parser is completely different it has the potential of introducing
+            differences in dependency inference. Although the Pants suite of tests only identified
+            differences when using the `# pants: no-infer-dep` pragma, and string-based
+            imports/assets, Out of an abundance of caution, this is behind a feature flag that will
+            eventually be on-by-default then removed.
+
+            It is recommended that you run `{bin_name()} peek :: > before.json` and then
+            `{bin_name()} --python-infer-use-rust-parser peek :: > after.json` and compare the two
+            results. If all looks good, set `use_rust_parser` in `[python-infer]` in `pants.toml`.
+            If you think there's a bug please file an issue: https://github.com/pantsbuild/pants/issues/new/choose.
+            """
+        ),
+    )
diff --git a/src/python/pants/backend/python/goals/package_pex_binary_integration_test.py b/src/python/pants/backend/python/goals/package_pex_binary_integration_test.py
@@ -63,7 +63,9 @@ def rule_runner() -> PythonRuleRunner:
         ],
         objects={"python_artifact": PythonArtifact},
     )
-    rule_runner.set_options([], env_inherit={"PATH", "PYENV_ROOT", "HOME"})
+    rule_runner.set_options(
+        ["--python-infer-use-rust-parser"], env_inherit={"PATH", "PYENV_ROOT", "HOME"}
+    )
     return rule_runner
 
 

diff --git a/src/python/pants/engine/internals/native_dep_inference.py b/src/python/pants/engine/internals/native_dep_inference.py
@@ -0,0 +1,18 @@
+# Copyright 2023 Pants project contributors (see CONTRIBUTORS.md).
+# Licensed under the Apache License, Version 2.0 (see LICENSE).
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from pants.util.frozendict import FrozenDict
+
+
+@dataclass(frozen=True)
+class NativeParsedPythonDependencies:
+    imports: FrozenDict[str, tuple[int, bool]]
+    string_candidates: FrozenDict[str, int]
+
+    def __init__(self, imports: dict[str, tuple[int, bool]], string_candidates: dict[str, int]):
+        object.__setattr__(self, "imports", FrozenDict(imports))
+        object.__setattr__(self, "string_candidates", FrozenDict(string_candidates))
diff --git a/src/python/pants/engine/internals/scheduler.py b/src/python/pants/engine/internals/scheduler.py
@@ -35,6 +35,7 @@
 from pants.engine.goal import CurrentExecutingGoals, Goal
 from pants.engine.internals import native_engine
 from pants.engine.internals.docker import DockerResolveImageRequest, DockerResolveImageResult
+from pants.engine.internals.native_dep_inference import NativeParsedPythonDependencies
 from pants.engine.internals.native_engine import (
     PyExecutionRequest,
     PyExecutionStrategyOptions,
@@ -170,6 +171,7 @@ def __init__(
             engine_aware_parameter=EngineAwareParameter,
             docker_resolve_image_request=DockerResolveImageRequest,
             docker_resolve_image_result=DockerResolveImageResult,
+            parsed_python_deps_result=NativeParsedPythonDependencies,
         )
         remoting_options = PyRemotingOptions(
             execution_enable=execution_options.remote_execution,