Skip to content

Commit

Permalink
gc: Handle remote field in --cloud and --not-in-remote.
Browse files Browse the repository at this point in the history
  • Loading branch information
daavoo committed Apr 27, 2023
1 parent c7e2ec9 commit 64e787f
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 21 deletions.
55 changes: 39 additions & 16 deletions dvc/repo/gc.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import logging
from typing import TYPE_CHECKING, List, Optional
from typing import TYPE_CHECKING, List, Optional, Set, Tuple

from dvc.exceptions import InvalidArgumentError

from . import locked

if TYPE_CHECKING:
from dvc.repo import Repo
from dvc_data.hashfile.hash_info import HashInfo
from dvc_objects.db import ObjectDB

logger = logging.getLogger(__name__)

Expand All @@ -28,8 +30,22 @@ def _validate_args(**kwargs):
raise InvalidArgumentError("`--num` can only be used alongside `--rev`")


def _used_obj_ids_not_in_remote(repo, default_remote, jobs, remote_odb_to_obj_ids):
used_obj_ids = set()
remote_oids = set()
for remote_odb, obj_ids in remote_odb_to_obj_ids:
if remote_odb is None:
remote_odb = repo.cloud.get_remote_odb(default_remote, "gc --not-in-remote")

remote_oids.update(
remote_odb.list_oids_exists({x.value for x in obj_ids}, jobs=jobs)
)
used_obj_ids.update(obj_ids)
return {obj for obj in used_obj_ids if obj.value not in remote_oids}


@locked
def gc( # noqa: PLR0913, C901
def gc( # noqa: PLR0912, PLR0913, C901
self: "Repo",
all_branches: bool = False,
cloud: bool = False,
Expand Down Expand Up @@ -73,13 +89,14 @@ def gc( # noqa: PLR0913, C901
repos = []
all_repos = [Repo(path) for path in repos]

used_obj_ids = set()
odb_to_obj_ids: List[Tuple[Optional["ObjectDB"], Set["HashInfo"]]] = []

with ExitStack() as stack:
for repo in all_repos:
stack.enter_context(repo.lock)

for repo in [*all_repos, self]:
for obj_ids in repo.used_objs(
for odb, obj_ids in repo.used_objs(
all_branches=all_branches,
with_deps=with_deps,
all_tags=all_tags,
Expand All @@ -91,28 +108,34 @@ def gc( # noqa: PLR0913, C901
jobs=jobs,
revs=[rev] if rev else None,
num=num or 1,
).values():
used_obj_ids.update(obj_ids)
).items():
odb_to_obj_ids.append((odb, obj_ids))

if not odb_to_obj_ids:
odb_to_obj_ids = [(None, set())]

if not_in_remote:
cloud_odb = self.cloud.get_remote_odb(remote, "gc --not-in-remote")
remote_hashes = list(cloud_odb.all(jobs=jobs))
used_obj_ids = {x for x in used_obj_ids if x.value not in remote_hashes}
used_obj_ids = _used_obj_ids_not_in_remote(self, remote, jobs, odb_to_obj_ids)
else:
used_obj_ids = set()
for _, obj_ids in odb_to_obj_ids:
used_obj_ids.update(obj_ids)

for scheme, odb in self.cache.by_scheme():
if not odb:
continue

removed = ogc(odb, used_obj_ids, jobs=jobs)
if not removed:
logger.info("No unused '%s' cache to remove.", scheme)

if not cloud:
return

cloud_odb = self.cloud.get_remote_odb(remote, "gc -c")
removed = ogc(cloud_odb, used_obj_ids, jobs=jobs)
if removed:
get_index(cloud_odb).clear()
else:
logger.info("No unused cache to remove from remote.")
for remote_odb, obj_ids in odb_to_obj_ids:
if remote_odb is None:
remote_odb = repo.cloud.get_remote_odb(remote, "gc -c")
removed = ogc(remote_odb, obj_ids, jobs=jobs)
if removed:
get_index(remote_odb).clear()
else:
logger.info("No unused cache to remove from remote.")
35 changes: 30 additions & 5 deletions tests/func/test_gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,32 @@ def test_gc_not_in_remote_remote_arg(tmp_dir, scm, dvc, tmp_path_factory, mocker
assert len(mocked_remove.mock_calls) == 3


def test_gc_not_in_remote_with_remote_field(
tmp_dir, scm, dvc, tmp_path_factory, mocker
):
storage = os.fspath(tmp_path_factory.mktemp("test_remote_base"))
dvc.config["remote"]["local_remote"] = {"url": storage}
dvc.config["core"]["remote"] = "local_remote"

other_storage = os.fspath(tmp_path_factory.mktemp("test_remote_other"))
dvc.config["remote"]["other_remote"] = {"url": other_storage}

text = textwrap.dedent(
"""\
outs:
- path: foo
remote: other_remote
"""
)
tmp_dir.gen("foo.dvc", text)
tmp_dir.dvc_gen("foo", "foo")
dvc.push()

mocked_remove = mocker.spy(LocalFileSystem, "remove")
dvc.gc(workspace=True, not_in_remote=True)
assert len(mocked_remove.mock_calls) == 1


def test_gc_not_in_remote_cloud(tmp_dir, scm, dvc):
with pytest.raises(
InvalidArgumentError,
Expand All @@ -409,8 +435,7 @@ def test_gc_not_in_remote_cloud(tmp_dir, scm, dvc):
dvc.gc(workspace=True, not_in_remote=True, cloud=True)


@pytest.mark.xfail(reason="--cloud Doesn't respect `remote` field.")
def test_gc_remote_field(tmp_dir, scm, dvc, tmp_path_factory, mocker):
def test_gc_cloud_remote_field(tmp_dir, scm, dvc, tmp_path_factory, mocker):
storage = os.fspath(tmp_path_factory.mktemp("test_remote_base"))
dvc.config["remote"]["local_remote"] = {"url": storage}
dvc.config["core"]["remote"] = "local_remote"
Expand All @@ -425,10 +450,10 @@ def test_gc_remote_field(tmp_dir, scm, dvc, tmp_path_factory, mocker):
"""
)
tmp_dir.gen("foo.dvc", text)
(foo,) = tmp_dir.dvc_gen("foo", "foo")
tmp_dir.dvc_gen("foo", "foo")
dvc.push()
dvc.remove(foo.relpath)
tmp_dir.dvc_gen("foo", "bar")

mocked_remove = mocker.spy(LocalFileSystem, "remove")
dvc.gc(workspace=True, cloud=True)
assert len(mocked_remove.mock_calls) == 2
assert len(mocked_remove.mock_calls) == 2 # local and other_remote

0 comments on commit 64e787f

Please sign in to comment.