Skip to content

Commit

Permalink
repo.collect: Add duplicates option.
Browse files Browse the repository at this point in the history
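Defaults to `False`, which keeps the existing behaviour: once an `out` matches a provided `target_path`, that path is dropped, so later `outs` sharing it are filtered out.
If `True`, the matched path is not dropped, so every `out` sharing a provided `target_path` is returned.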
  • Loading branch information
daavoo authored and efiop committed Jun 21, 2022
1 parent 735b563 commit 66c2e8c
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 5 deletions.
12 changes: 7 additions & 5 deletions dvc/repo/collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ def _collect_paths(
return target_paths


def _filter_duplicates(
outs: Outputs, fs_paths: StrPaths
def _filter_outs(
outs: Outputs, fs_paths: StrPaths, duplicates=False
) -> Tuple[Outputs, StrPaths]:
res_outs: Outputs = []
fs_res_paths = fs_paths
Expand All @@ -61,8 +61,9 @@ def _filter_duplicates(
fs_path = out.repo.dvcfs.from_os_path(out.fs_path)
if fs_path in fs_paths:
res_outs.append(out)
# MUTATING THE SAME LIST!!
fs_res_paths.remove(fs_path)
if not duplicates:
# MUTATING THE SAME LIST!!
fs_res_paths.remove(fs_path)

return res_outs, fs_res_paths

Expand All @@ -74,6 +75,7 @@ def collect(
output_filter: FilterFn = None,
rev: str = None,
recursive: bool = False,
duplicates: bool = False,
) -> Tuple[Outputs, StrPaths]:
assert targets or output_filter

Expand All @@ -85,4 +87,4 @@ def collect(

target_paths = _collect_paths(repo, targets, recursive=recursive, rev=rev)

return _filter_duplicates(outs, target_paths)
return _filter_outs(outs, target_paths, duplicates=duplicates)
17 changes: 17 additions & 0 deletions tests/unit/test_collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,20 @@ def test_no_file_on_target_rev(tmp_dir, scm, dvc, caplog):
collect(dvc, targets=["file.yaml"], rev="current_branch")

assert "'file.yaml' was not found at: 'current_branch'." in caplog.text


def test_collect_duplicates(tmp_dir, scm, dvc):
    """Check how ``collect`` treats several entries matching one target.

    Two stages are created, each declaring a single param dependency, and
    ``params.yaml`` is used as the collection target (presumably both params
    are resolved against that file -- the expected counts below rely on it).

    * Without ``duplicates`` only one entry is returned for the target.
    * With ``duplicates=True`` both entries are returned.
    * With ``duplicates=True`` and a target no stage refers to (``foobar``),
      nothing is returned.
    """
    tmp_dir.gen("params.yaml", "foo: 1\nbar: 2")
    tmp_dir.gen("foobar", "")

    dvc.run(name="stage-1", cmd="echo stage-1", params=["foo"])
    dvc.run(name="stage-2", cmd="echo stage-2", params=["bar"])

    # Default: entries sharing the same target path collapse to a single one.
    deduplicated, _ = collect(dvc, deps=True, targets=["params.yaml"])
    assert len(deduplicated) == 1

    # Opt-in: every entry that matches the target path is kept.
    with_duplicates, _ = collect(
        dvc, deps=True, targets=["params.yaml"], duplicates=True
    )
    assert len(with_duplicates) == 2

    # Keeping duplicates must not make unrelated targets match anything.
    unrelated, _ = collect(
        dvc, deps=True, targets=["foobar"], duplicates=True
    )
    assert not unrelated

0 comments on commit 66c2e8c

Please sign in to comment.