Add dependency inference for Shell (#11857)
This uses a suggestion from @jsirois to run Shellcheck against each file in isolation, so that Shellcheck reports each `source`d file as not specified as input; we then extract those paths from the JSON error messages.

Outside of this new parsing logic, this is mostly copy pasta from our Protobuf<->Protobuf dependency inference implementation.
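
A minimal standalone sketch of that idea, for illustration only: it assumes only that a `shellcheck` binary is on PATH, the helper name `sourced_paths` and the constant `PATH_RE` are made up here, and the real rule below uses Pants' downloaded Shellcheck and process execution instead of `subprocess`.

from __future__ import annotations

import json
import re
import subprocess

# SC1091 ("Not following: ...") fires for every `source`d file that was not passed
# to Shellcheck as an input, which is exactly the signal we want when a file is
# checked alone.
PATH_RE = re.compile(r"Not following: (.+) was not specified as input")


def sourced_paths(shell_file: str) -> set[str]:
    # Run Shellcheck against the single file in isolation so that every sourced
    # sibling is reported as "not specified as input".
    result = subprocess.run(
        ["shellcheck", "--format=json", shell_file], capture_output=True, text=True
    )
    paths = set()
    for entry in json.loads(result.stdout or "[]"):
        if entry.get("code") == 1091:
            match = PATH_RE.match(entry.get("message", ""))
            if match:
                paths.add(match.group(1))
    return paths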

[ci skip-build-wheels]
Eric-Arellano authored Apr 8, 2021
1 parent 7ef8880 commit 1d6e3bb
Showing 8 changed files with 374 additions and 15 deletions.
9 changes: 2 additions & 7 deletions BUILD
@@ -1,17 +1,12 @@
# Copyright 2019 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

shell_library(name="scripts", sources=["cargo", "pants"])

# We use this to establish the build root, rather than `./pants`, because we cannot safely use the
# latter as the sentinel filename per https://github.com/pantsbuild/pants/pull/8105.
files(name='build_root', sources=["BUILD_ROOT"])

files(name='gitignore', sources=['.gitignore'])
files(name='pants_toml', sources=['pants.toml'])
files(name='pyproject', sources=['pyproject.toml'])

shell_library(
name="pants",
sources=["pants"],
dependencies=["build-support/common.sh", "build-support/pants_venv", "build-support/bin/rust/bootstrap_code.sh"],
)
shell_library(name="cargo", sources=["cargo"], dependencies=["build-support/common.sh"])
5 changes: 1 addition & 4 deletions build-support/bin/BUILD
@@ -1,10 +1,7 @@
# Copyright 2019 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

shell_library(
name="sh_scripts",
dependencies=["build-support/common.sh", "build-support/pants_venv", "build-support/bin/rust/bootstrap_code.sh"],
)
shell_library(name="sh_scripts")

python_library(name="py_scripts")
python_tests(name="py_tests", timeout=90) # reversion_test.py times out occasionally.
2 changes: 1 addition & 1 deletion build-support/bin/rust/BUILD
@@ -1,4 +1,4 @@
# Copyright 2021 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

shell_library(dependencies=["build-support/common.sh", "build-support/pants_venv"])
shell_library()
2 changes: 1 addition & 1 deletion build-support/githooks/BUILD
@@ -1,4 +1,4 @@
# Copyright 2021 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

shell_library(sources=["pre-commit", "prepare-commit-msg"], dependencies=["build-support/common.sh"])
shell_library(sources=["pre-commit", "prepare-commit-msg"])
199 changes: 199 additions & 0 deletions src/python/pants/backend/shell/dependency_inference.py
@@ -0,0 +1,199 @@
# Copyright 2021 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

from __future__ import annotations

import json
import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import DefaultDict

from pants.backend.shell.lint.shellcheck.subsystem import Shellcheck
from pants.backend.shell.shell_setup import ShellSetup
from pants.backend.shell.target_types import ShellSources
from pants.base.specs import AddressSpecs, DescendantAddresses
from pants.core.util_rules.external_tool import DownloadedExternalTool, ExternalToolRequest
from pants.engine.addresses import Address
from pants.engine.collection import DeduplicatedCollection
from pants.engine.fs import Digest, DigestSubset, MergeDigests, PathGlobs
from pants.engine.platform import Platform
from pants.engine.process import FallibleProcessResult, Process
from pants.engine.rules import Get, MultiGet, collect_rules, rule
from pants.engine.target import (
    Dependencies,
    DependenciesRequest,
    ExplicitlyProvidedDependencies,
    HydratedSources,
    HydrateSourcesRequest,
    InferDependenciesRequest,
    InferredDependencies,
    SourcesPaths,
    SourcesPathsRequest,
    Targets,
    WrappedTarget,
)
from pants.engine.unions import UnionRule
from pants.util.frozendict import FrozenDict
from pants.util.logging import LogLevel
from pants.util.ordered_set import OrderedSet

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class ShellMapping:
"""A mapping of Shell file names to their owning file address."""

mapping: FrozenDict[str, Address]
ambiguous_modules: FrozenDict[str, tuple[Address, ...]]


@rule(desc="Creating map of Shell file names to Shell targets", level=LogLevel.DEBUG)
async def map_shell_files() -> ShellMapping:
    all_expanded_targets = await Get(Targets, AddressSpecs([DescendantAddresses("")]))
    shell_tgts = tuple(tgt for tgt in all_expanded_targets if tgt.has_field(ShellSources))
    sources_per_target = await MultiGet(
        Get(SourcesPaths, SourcesPathsRequest(tgt[ShellSources])) for tgt in shell_tgts
    )

    files_to_addresses: dict[str, Address] = {}
    files_with_multiple_owners: DefaultDict[str, set[Address]] = defaultdict(set)
    for tgt, sources in zip(shell_tgts, sources_per_target):
        for f in sources.files:
            if f in files_to_addresses:
                files_with_multiple_owners[f].update({files_to_addresses[f], tgt.address})
            else:
                files_to_addresses[f] = tgt.address

    # Remove files with ambiguous owners.
    for ambiguous_f in files_with_multiple_owners:
        files_to_addresses.pop(ambiguous_f)

    return ShellMapping(
        mapping=FrozenDict(sorted(files_to_addresses.items())),
        ambiguous_modules=FrozenDict(
            (k, tuple(sorted(v))) for k, v in sorted(files_with_multiple_owners.items())
        ),
    )


class ParsedShellImports(DeduplicatedCollection):
    sort_input = True


@dataclass(frozen=True)
class ParseShellImportsRequest:
    # NB: We parse per-file, rather than per-target, so that each file is analyzed in complete
    # isolation without its sibling files present. Shellcheck then errors when the file tries to
    # source a sibling, which lets us extract that path.
    digest: Digest
    fp: str


PATH_FROM_SHELLCHECK_ERROR = re.compile(r"Not following: (.+) was not specified as input")


@rule
async def parse_shell_imports(
    request: ParseShellImportsRequest, shellcheck: Shellcheck
) -> ParsedShellImports:
    # We use Shellcheck to parse for us by running it against each file in isolation, which means
    # that all `source` statements will error. Then, we can extract the problematic paths from the
    # JSON output.
    downloaded_shellcheck = await Get(
        DownloadedExternalTool, ExternalToolRequest, shellcheck.get_request(Platform.current)
    )
    input_digest = await Get(Digest, MergeDigests([request.digest, downloaded_shellcheck.digest]))
    process_result = await Get(
        FallibleProcessResult,
        Process(
            # NB: We do not load up `[shellcheck].{args,config}` because it would risk breaking
            # determinism of dependency inference in an unexpected way.
            [downloaded_shellcheck.exe, "--format=json", request.fp],
            input_digest=input_digest,
            description=f"Detect Shell imports for {request.fp}",
            level=LogLevel.DEBUG,
        ),
    )

    try:
        output = json.loads(process_result.stdout)
    except json.JSONDecodeError:
        logger.error(
            f"Parsing {request.fp} for dependency inference failed because Shellcheck's output "
            f"could not be loaded as JSON. Please open a GitHub issue at "
            f"https://github.com/pantsbuild/pants/issues/new with this error message attached.\n\n"
            f"\nshellcheck version: {shellcheck.version}\n"
            f"process_result.stdout: {process_result.stdout.decode()}"
        )
        return ParsedShellImports()

    paths = set()
    for error in output:
        if not error.get("code", "") == 1091:
            continue
        msg = error.get("message", "")
        matches = PATH_FROM_SHELLCHECK_ERROR.match(msg)
        if matches:
            paths.add(matches.group(1))
        else:
            logger.error(
                f"Parsing {request.fp} for dependency inference failed because Shellcheck's error "
                f"message was not in the expected format. Please open a GitHub issue at "
                f"https://github.com/pantsbuild/pants/issues/new with this error message "
                f"attached.\n\n\nshellcheck version: {shellcheck.version}\n"
                f"error JSON entry: {error}"
            )
    return ParsedShellImports(paths)


class InferShellDependencies(InferDependenciesRequest):
    infer_from = ShellSources


@rule(desc="Inferring Shell dependencies by analyzing imports")
async def infer_shell_dependencies(
    request: InferShellDependencies, shell_mapping: ShellMapping, shell_setup: ShellSetup
) -> InferredDependencies:
    if not shell_setup.dependency_inference:
        return InferredDependencies([], sibling_dependencies_inferrable=False)

    address = request.sources_field.address
    wrapped_tgt = await Get(WrappedTarget, Address, address)
    explicitly_provided_deps, hydrated_sources = await MultiGet(
        Get(ExplicitlyProvidedDependencies, DependenciesRequest(wrapped_tgt.target[Dependencies])),
        Get(HydratedSources, HydrateSourcesRequest(request.sources_field)),
    )
    per_file_digests = await MultiGet(
        Get(Digest, DigestSubset(hydrated_sources.snapshot.digest, PathGlobs([f])))
        for f in hydrated_sources.snapshot.files
    )
    all_detected_imports = await MultiGet(
        Get(ParsedShellImports, ParseShellImportsRequest(digest, f))
        for digest, f in zip(per_file_digests, hydrated_sources.snapshot.files)
    )

    result: OrderedSet[Address] = OrderedSet()
    for detected_imports in all_detected_imports:
        for import_path in detected_imports:
            unambiguous = shell_mapping.mapping.get(import_path)
            ambiguous = shell_mapping.ambiguous_modules.get(import_path)
            if unambiguous:
                result.add(unambiguous)
            elif ambiguous:
                explicitly_provided_deps.maybe_warn_of_ambiguous_dependency_inference(
                    ambiguous,
                    address,
                    import_reference="file",
                    context=f"The target {address} sources `{import_path}`",
                )
                maybe_disambiguated = explicitly_provided_deps.disambiguated_via_ignores(ambiguous)
                if maybe_disambiguated:
                    result.add(maybe_disambiguated)
    return InferredDependencies(sorted(result), sibling_dependencies_inferrable=True)


def rules():
    return (*collect_rules(), UnionRule(InferDependenciesRequest, InferShellDependencies))
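
To make the parsing in parse_shell_imports concrete: for a file containing `source a/f1.sh`, Shellcheck's `--format=json` output includes an SC1091 entry roughly like the hypothetical one below. The exact field values and trailing wording are assumptions; the rule only relies on the "Not following: ... was not specified as input" fragment that PATH_FROM_SHELLCHECK_ERROR matches.

# Hypothetical SC1091 entry for illustration; values are not taken from this commit.
entry = {
    "file": "a/f2.sh",
    "line": 1,
    "column": 1,
    "level": "info",
    "code": 1091,
    "message": "Not following: a/f1.sh was not specified as input (see shellcheck -x)",
}

match = PATH_FROM_SHELLCHECK_ERROR.match(entry["message"])
assert match is not None and match.group(1) == "a/f1.sh"
# infer_shell_dependencies then resolves "a/f1.sh" to an owning address via ShellMapping.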
154 changes: 154 additions & 0 deletions src/python/pants/backend/shell/dependency_inference_test.py
@@ -0,0 +1,154 @@
# Copyright 2021 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

from __future__ import annotations

from textwrap import dedent

import pytest

from pants.backend.shell import dependency_inference
from pants.backend.shell.dependency_inference import (
    InferShellDependencies,
    ParsedShellImports,
    ParseShellImportsRequest,
    ShellMapping,
)
from pants.backend.shell.target_types import ShellLibrary, ShellSources
from pants.core.util_rules import external_tool
from pants.engine.addresses import Address
from pants.engine.target import InferredDependencies
from pants.testutil.rule_runner import QueryRule, RuleRunner
from pants.util.frozendict import FrozenDict


@pytest.fixture
def rule_runner() -> RuleRunner:
    return RuleRunner(
        rules=[
            *dependency_inference.rules(),
            *external_tool.rules(),
            QueryRule(ShellMapping, []),
            QueryRule(ParsedShellImports, [ParseShellImportsRequest]),
            QueryRule(InferredDependencies, [InferShellDependencies]),
        ],
        target_types=[ShellLibrary],
    )


def test_shell_mapping(rule_runner: RuleRunner) -> None:
    rule_runner.write_files(
        {
            # Two Shell files belonging to the same target. We should use two file addresses.
            "a/f1.sh": "",
            "a/f2.sh": "",
            "a/BUILD": "shell_library()",
            # >1 target owns the same file, so it's ambiguous.
            "b/f.sh": "",
            "b/BUILD": "shell_library(name='t1')\nshell_library(name='t2')",
        }
    )
    result = rule_runner.request(ShellMapping, [])
    assert result == ShellMapping(
        mapping=FrozenDict(
            {
                "a/f1.sh": Address("a", relative_file_path="f1.sh"),
                "a/f2.sh": Address("a", relative_file_path="f2.sh"),
            }
        ),
        ambiguous_modules=FrozenDict(
            {
                "b/f.sh": (
                    Address("b", target_name="t1", relative_file_path="f.sh"),
                    Address("b", target_name="t2", relative_file_path="f.sh"),
                )
            }
        ),
    )


def test_parse_imports(rule_runner: RuleRunner) -> None:
    def parse(content: str) -> set[str]:
        snapshot = rule_runner.make_snapshot({"subdir/f.sh": content})
        return set(
            rule_runner.request(
                ParsedShellImports, [ParseShellImportsRequest(snapshot.digest, "subdir/f.sh")]
            )
        )

    assert not parse("")
    assert not parse("#!/usr/bin/env bash")
    assert not parse("def python():\n print('hi')")
    assert parse("source a/b.sh") == {"a/b.sh"}
    assert parse(". a/b.sh") == {"a/b.sh"}
    assert parse("source ../parent.sh") == {"../parent.sh"}
    assert parse("echo foo\nsource foo.sh\necho bar; source bar.sh") == {"foo.sh", "bar.sh"}

    # Can use a Shellcheck directive to fix unrecognized imports.
    assert not parse("source ${FOO}")
    assert parse("# shellcheck source=a/b.sh\nsource ${FOO}") == {"a/b.sh"}


def test_dependency_inference(rule_runner: RuleRunner, caplog) -> None:
    rule_runner.write_files(
        {
            "a/f1.sh": dedent(
                """\
                source b/f.sh
                source unknown/f.sh
                """
            ),
            "a/f2.sh": "source a/f1.sh",
            "a/BUILD": "shell_library()",
            "b/f.sh": "",
            "b/BUILD": "shell_library()",
            # Test handling of ambiguous imports. We should warn on the ambiguous dependency, but
            # not warn on the disambiguated one and should infer a dep.
            "ambiguous/dep.sh": "",
            "ambiguous/disambiguated.sh": "",
            "ambiguous/main.sh": dedent(
                """\
                source ambiguous/dep.sh
                source ambiguous/disambiguated.sh
                """
            ),
            "ambiguous/BUILD": dedent(
                """\
                shell_library(name='dep1', sources=['dep.sh', 'disambiguated.sh'])
                shell_library(name='dep2', sources=['dep.sh', 'disambiguated.sh'])
                shell_library(
                    name='main',
                    sources=['main.sh'],
                    dependencies=['!./disambiguated.sh:dep2'],
                )
                """
            ),
        }
    )

    def run_dep_inference(address: Address) -> InferredDependencies:
        tgt = rule_runner.get_target(address)
        return rule_runner.request(
            InferredDependencies, [InferShellDependencies(tgt[ShellSources])]
        )

    build_address = Address("a")
    assert run_dep_inference(build_address) == InferredDependencies(
        [Address("b", relative_file_path="f.sh"), Address("a", relative_file_path="f1.sh")],
        sibling_dependencies_inferrable=True,
    )

    file_address = Address("a", relative_file_path="f1.sh")
    assert run_dep_inference(file_address) == InferredDependencies(
        [Address("b", relative_file_path="f.sh")], sibling_dependencies_inferrable=True
    )

    caplog.clear()
    assert run_dep_inference(Address("ambiguous", target_name="main")) == InferredDependencies(
        [Address("ambiguous", target_name="dep1", relative_file_path="disambiguated.sh")],
        sibling_dependencies_inferrable=True,
    )
    assert len(caplog.records) == 1
    assert "The target ambiguous:main sources `ambiguous/dep.sh`" in caplog.text
    assert "['ambiguous/dep.sh:dep1', 'ambiguous/dep.sh:dep2']" in caplog.text
    assert "disambiguated.sh" not in caplog.text