Skip to content

Commit

Permalink
diff: add support for --targets (iterative#4938)
Browse files Browse the repository at this point in the history
* wip: add target support (back) to dvc diff

* Change diff param to targets and update repo.diff accordingly

* Update dvc diff --targets with PR review feedback

* Tests for dvc diff --targets

* Update dvc diff --targets with PR review feedback

* Update dvc diff --targets with PR review feedback and additional tests

* split out new diff target tests as per PR comment

* remove unnecessary cache dir removal from setup_targets_test

* Update dvc/command/diff.py

Co-authored-by: Ruslan Kuprieiev <[email protected]>
  • Loading branch information
sandeepmistry and efiop authored Dec 15, 2020
1 parent 2512df7 commit 31502ae
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 11 deletions.
13 changes: 12 additions & 1 deletion dvc/command/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import colorama

from dvc.command import completion
from dvc.command.base import CmdBase, append_doc_link
from dvc.exceptions import DvcException

Expand Down Expand Up @@ -127,7 +128,9 @@ def _format(diff, hide_missing=False):

def run(self):
try:
diff = self.repo.diff(self.args.a_rev, self.args.b_rev)
diff = self.repo.diff(
self.args.a_rev, self.args.b_rev, self.args.targets
)
show_hash = self.args.show_hash
hide_missing = self.args.b_rev or self.args.hide_missing
if hide_missing:
Expand Down Expand Up @@ -167,6 +170,14 @@ def add_parser(subparsers, parent_parser):
help=DIFF_DESCRIPTION,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
diff_parser.add_argument(
"--targets",
nargs="*",
help=(
"Limit command scope to these tracked files or directories. "
"Accepts one or more file paths."
),
).complete = completion.FILE
diff_parser.add_argument(
"a_rev",
help="Old Git commit to compare (defaults to HEAD)",
Expand Down
64 changes: 54 additions & 10 deletions dvc/repo/diff.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import os

from dvc.exceptions import PathMissingError
from dvc.repo import locked
from dvc.tree.local import LocalTree
from dvc.tree.repo import RepoTree
Expand All @@ -9,7 +10,7 @@


@locked
def diff(self, a_rev="HEAD", b_rev=None):
def diff(self, a_rev="HEAD", b_rev=None, targets=None):
"""
By default, it compares the workspace with the last commit's tree.
Expand All @@ -23,12 +24,28 @@ def diff(self, a_rev="HEAD", b_rev=None):

b_rev = b_rev if b_rev else "workspace"
results = {}
missing_targets = {}
for rev in self.brancher(revs=[a_rev, b_rev]):
if rev == "workspace" and rev != b_rev:
# brancher always returns workspace, but we only need to compute
# workspace paths/checksums if b_rev was None
continue
results[rev] = _paths_checksums(self)

targets_path_infos = None
if targets is not None:
# convert targets to path_infos, and capture any missing targets
targets_path_infos, missing_targets[rev] = _targets_to_path_infos(
self, targets
)

results[rev] = _paths_checksums(self, targets_path_infos)

if targets is not None:
# check for overlapping missing targets between a_rev and b_rev
for target in set(missing_targets[a_rev]) & set(
missing_targets[b_rev]
):
raise PathMissingError(target, self)

old = results[a_rev]
new = results[b_rev]
Expand Down Expand Up @@ -62,7 +79,7 @@ def diff(self, a_rev="HEAD", b_rev=None):
return ret if any(ret.values()) else {}


def _paths_checksums(repo):
def _paths_checksums(repo, targets):
"""
A dictionary of checksums addressed by relpaths collected from
the current tree outputs.
Expand All @@ -74,10 +91,10 @@ def _paths_checksums(repo):
file: "data"
"""

return dict(_output_paths(repo))
return dict(_output_paths(repo, targets))


def _output_paths(repo):
def _output_paths(repo, targets):
repo_tree = RepoTree(repo, stream=True)
on_working_tree = isinstance(repo.tree, LocalTree)

Expand All @@ -101,17 +118,29 @@ def _to_checksum(output):
for stage in repo.stages:
for output in stage.outs:
if _exists(output):
yield _to_path(output), _to_checksum(output)
if output.is_dir_checksum:
yield from _dir_output_paths(repo_tree, output)
yield_output = targets is None or any(
output.path_info.isin_or_eq(target) for target in targets
)

if yield_output:
yield _to_path(output), _to_checksum(output)

def _dir_output_paths(repo_tree, output):
if output.is_dir_checksum and (
yield_output
or any(target.isin(output.path_info) for target in targets)
):
yield from _dir_output_paths(repo_tree, output, targets)


def _dir_output_paths(repo_tree, output, targets=None):
from dvc.config import NoRemoteError

try:
for fname in repo_tree.walk_files(output.path_info):
yield str(fname), repo_tree.get_file_hash(fname).value
if targets is None or any(
fname.isin_or_eq(target) for target in targets
):
yield str(fname), repo_tree.get_file_hash(fname).value
except NoRemoteError:
logger.warning("dir cache entry for '%s' is missing", output)

Expand All @@ -124,3 +153,18 @@ def _filter_missing(repo, paths):
out = metadata.outs[0]
if out.status().get(str(out)) == "not in cache":
yield path


def _targets_to_path_infos(repo, targets):
path_infos = []
missing = []

repo_tree = RepoTree(repo, stream=True)

for target in targets:
if repo_tree.exists(target):
path_infos.append(repo_tree.metadata(target).path_info)
else:
missing.append(target)

return path_infos, missing
138 changes: 138 additions & 0 deletions tests/func/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,3 +264,141 @@ def test_no_commits(tmp_dir):
assert Git().no_commits

assert Repo.init().diff() == {}


def setup_targets_test(tmp_dir):
tmp_dir.dvc_gen("file", "first", commit="add a file")

tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}})
tmp_dir.dvc_gen("file", "second")

tmp_dir.dvc_gen(os.path.join("dir_with", "file.txt"), "first")


def test_targets_missing_path(tmp_dir, scm, dvc):
from dvc.exceptions import PathMissingError

setup_targets_test(tmp_dir)

with pytest.raises(PathMissingError):
dvc.diff(targets=["missing"])


def test_targets_single_file(tmp_dir, scm, dvc):
setup_targets_test(tmp_dir)

assert dvc.diff(targets=["file"]) == {
"added": [],
"deleted": [],
"modified": [
{
"path": "file",
"hash": {"old": digest("first"), "new": digest("second")},
}
],
"not in cache": [],
}


def test_targets_single_dir(tmp_dir, scm, dvc):
setup_targets_test(tmp_dir)

dir_checksum = "5fb6b29836c388e093ca0715c872fe2a.dir"

expected_result = {
"added": [
{"path": os.path.join("dir", ""), "hash": dir_checksum},
{"path": os.path.join("dir", "1"), "hash": digest("1")},
{"path": os.path.join("dir", "2"), "hash": digest("2")},
],
"deleted": [],
"modified": [],
"not in cache": [],
}

assert dvc.diff(targets=["dir"]) == expected_result
assert dvc.diff(targets=["dir" + os.path.sep]) == expected_result


def test_targets_single_file_in_dir(tmp_dir, scm, dvc):
setup_targets_test(tmp_dir)

assert dvc.diff(targets=[os.path.join("dir", "1")]) == {
"added": [{"path": os.path.join("dir", "1"), "hash": digest("1")}],
"deleted": [],
"modified": [],
"not in cache": [],
}


def test_targets_two_files_in_dir(tmp_dir, scm, dvc):
setup_targets_test(tmp_dir)

assert dvc.diff(
targets=[os.path.join("dir", "1"), os.path.join("dir", "2")]
) == {
"added": [
{"path": os.path.join("dir", "1"), "hash": digest("1")},
{"path": os.path.join("dir", "2"), "hash": digest("2")},
],
"deleted": [],
"modified": [],
"not in cache": [],
}


def test_targets_file_and_dir(tmp_dir, scm, dvc):
setup_targets_test(tmp_dir)

dir_checksum = "5fb6b29836c388e093ca0715c872fe2a.dir"

assert dvc.diff(targets=["file", "dir"]) == {
"added": [
{"path": os.path.join("dir", ""), "hash": dir_checksum},
{"path": os.path.join("dir", "1"), "hash": digest("1")},
{"path": os.path.join("dir", "2"), "hash": digest("2")},
],
"deleted": [],
"modified": [
{
"path": "file",
"hash": {"old": digest("first"), "new": digest("second")},
}
],
"not in cache": [],
}


def test_targets_single_dir_with_file(tmp_dir, scm, dvc):
setup_targets_test(tmp_dir)

expected_result = {
"added": [
{
"path": os.path.join("dir_with", "file.txt"),
"hash": digest("first"),
},
],
"deleted": [],
"modified": [],
"not in cache": [],
}

assert dvc.diff(targets=["dir_with"]) == expected_result
assert dvc.diff(targets=["dir_with" + os.path.sep]) == expected_result


def test_targets_single_file_in_dir_with_file(tmp_dir, scm, dvc):
setup_targets_test(tmp_dir)

assert dvc.diff(targets=[os.path.join("dir_with", "file.txt")]) == {
"added": [
{
"path": os.path.join("dir_with", "file.txt"),
"hash": digest("first"),
},
],
"deleted": [],
"modified": [],
"not in cache": [],
}

0 comments on commit 31502ae

Please sign in to comment.