Skip to content

Commit

Permalink
data cloud: pull: trust remote's md5 on proper flag
Browse files Browse the repository at this point in the history
  • Loading branch information
pared committed Jan 22, 2020
1 parent d942542 commit 0b1d1f5
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 4 deletions.
7 changes: 7 additions & 0 deletions dvc/command/data_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def run(self):
all_tags=self.args.all_tags,
with_deps=self.args.with_deps,
recursive=self.args.recursive,
trust_remote=self.args.trust_remote,
)
except DvcException:
logger.exception("failed to fetch data from the cloud")
Expand Down Expand Up @@ -231,6 +232,12 @@ def add_parser(subparsers, _parent_parser):
default=False,
help="Fetch cache for subdirectories of specified directory.",
)
fetch_parser.add_argument(
"--trust-remote",
action="store_true",
default=False,
help="Trust remote cache checksums upon fetching.",
)
fetch_parser.set_defaults(func=CmdDataFetch)

# Status
Expand Down
19 changes: 17 additions & 2 deletions dvc/data_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,14 @@ def push(self, cache, jobs=None, remote=None, show_checksums=False):
show_checksums=show_checksums,
)

def pull(self, cache, jobs=None, remote=None, show_checksums=False):
def pull(
self,
cache,
jobs=None,
remote=None,
show_checksums=False,
trust_remote=False,
):
"""Pull data items in a cloud-agnostic way.
Args:
Expand All @@ -73,12 +80,20 @@ def pull(self, cache, jobs=None, remote=None, show_checksums=False):
show_checksums (bool): show checksums instead of file names in
information messages.
"""
return self.repo.cache.local.pull(
downloaded_items_num = self.repo.cache.local.pull(
cache,
jobs=jobs,
remote=self.get_remote(remote, "pull"),
show_checksums=show_checksums,
)
if trust_remote:
for checksum in cache["local"].keys():
cache_file = self.repo.cache.local.checksum_to_path_info(
checksum
)
if self.repo.cache.local.exists(cache_file):
self.repo.state.save(cache_file, checksum)
return downloaded_items_num

def status(self, cache, jobs=None, remote=None, show_checksums=False):
"""Check status of data items in a cloud-agnostic way.
Expand Down
8 changes: 7 additions & 1 deletion dvc/output/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,12 @@ def unprotect(self):
self.remote.unprotect(self.path_info)

def _collect_used_dir_cache(
self, remote=None, force=False, jobs=None, filter_info=None
self,
remote=None,
force=False,
jobs=None,
filter_info=None,
trust_remote=False,
):
"""Get a list of `info`s related to the given directory.
Expand Down Expand Up @@ -370,6 +375,7 @@ def _collect_used_dir_cache(
jobs=jobs,
remote=remote,
show_checksums=False,
trust_remote=trust_remote,
)
except DvcException:
logger.debug("failed to pull cache for '{}'".format(self))
Expand Down
2 changes: 2 additions & 0 deletions dvc/repo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ def used_cache(
force=False,
jobs=None,
recursive=False,
trust_remote=False,
):
"""Get the stages related to the given target and collect
the `info` of its outputs.
Expand Down Expand Up @@ -262,6 +263,7 @@ def used_cache(
force=force,
jobs=jobs,
filter_info=filter_info,
trust_remote=trust_remote,
)
cache.update(used_cache, suffix=suffix)

Expand Down
8 changes: 7 additions & 1 deletion dvc/repo/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def _fetch(
with_deps=False,
all_tags=False,
recursive=False,
trust_remote=False,
):
"""Download data items from a cloud and imported repositories
Expand All @@ -42,14 +43,19 @@ def _fetch(
remote=remote,
jobs=jobs,
recursive=recursive,
trust_remote=trust_remote,
)

downloaded = 0
failed = 0

try:
downloaded += self.cloud.pull(
used, jobs, remote=remote, show_checksums=show_checksums
used,
jobs,
remote=remote,
show_checksums=show_checksums,
trust_remote=trust_remote,
)
except NoRemoteError:
if not used.external and used["local"]:
Expand Down
2 changes: 2 additions & 0 deletions dvc/repo/pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def pull(
all_tags=False,
force=False,
recursive=False,
trust_remote=False,
):
processed_files_count = self._fetch(
targets,
Expand All @@ -26,6 +27,7 @@ def pull(
all_tags=all_tags,
with_deps=with_deps,
recursive=recursive,
trust_remote=trust_remote,
)
self._checkout(
targets=targets, with_deps=with_deps, force=force, recursive=recursive
Expand Down
71 changes: 71 additions & 0 deletions tests/func/test_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@
import pytest
from mock import patch

from dvc import remote
from dvc.config import Config, ConfigError
from dvc.exceptions import DownloadError, UploadError
from dvc.main import main
from dvc.path_info import PathInfo
from dvc.remote import RemoteLOCAL, RemoteConfig
from dvc.remote.base import RemoteBASE
from dvc.compat import fspath
from dvc.scm import Git
from dvc.utils import file_md5
from tests.basic_env import TestDvc
from tests.remotes import Local

Expand Down Expand Up @@ -257,3 +260,71 @@ def test_modify_missing_remote(dvc):

with pytest.raises(ConfigError, match=r"unable to find remote section"):
remote_config.modify("myremote", "gdrive_client_id", "xxx")


def test_trust_remote_checksums(tmp_dir, mocker, tmp_path_factory, erepo_dir):
    """Pull a single tracked file with ``trust_remote=True``.

    A file is added to the external repo, the repo is git-cloned into
    ``tmp_dir`` and pulled there. ``remote.local.file_md5`` is replaced by
    a spy, and the test asserts the spy recorded exactly one call.
    """
    # Kept function-local, as in the original test.
    from dvc.repo import Repo
    from tests.utils import spy

    # Populate the source repository with one DVC-tracked file.
    with erepo_dir.chdir():
        erepo_dir.dvc_gen({"file": "file content"}, commit="add dir")

    clone_root = fspath(tmp_dir)
    Git.clone(fspath(erepo_dir), clone_root)
    cloned_repo = Repo(clone_root)

    # Spy on the md5 helper that the local remote module refers to.
    md5_spy = spy(file_md5)
    mocker.patch.object(remote.local, "file_md5", md5_spy)

    cloned_repo.pull(trust_remote=True)

    recorded_calls = md5_spy.mock.call_args_list
    assert len(recorded_calls) == 1


def test_trust_remote_checksums_dir(
    tmp_dir, mocker, tmp_path_factory, erepo_dir
):
    """Pull a tracked directory with ``trust_remote=True``.

    A directory holding one file is added to the external repo, the repo
    is git-cloned into ``tmp_dir`` and pulled there. ``remote.local.file_md5``
    is replaced by a spy, and the test asserts the spy recorded exactly
    one call.
    """
    # Kept function-local, as in the original test.
    from dvc.repo import Repo
    from tests.utils import spy

    # Populate the source repository with one DVC-tracked directory.
    with erepo_dir.chdir():
        erepo_dir.dvc_gen({"dir": {"file": "file content"}}, commit="add dir")

    clone_root = fspath(tmp_dir)
    Git.clone(fspath(erepo_dir), clone_root)
    cloned_repo = Repo(clone_root)

    # Spy on the md5 helper that the local remote module refers to.
    md5_spy = spy(file_md5)
    mocker.patch.object(remote.local, "file_md5", md5_spy)

    cloned_repo.pull(trust_remote=True)

    recorded_calls = md5_spy.mock.call_args_list
    assert len(recorded_calls) == 1


def test_trust_remote_checksums_external_dep(
    tmp_dir, mocker, tmp_path_factory, erepo_dir
):
    """Pull with ``trust_remote=True`` when the tracked file is external.

    A file created in a separate temporary directory is added to the
    external repo, the repo is git-cloned into ``tmp_dir`` and pulled there.
    ``remote.local.file_md5`` is replaced by a spy, and the test asserts
    the spy recorded exactly one call.
    """
    # Kept function-local, as in the original test.
    from dvc.repo import Repo
    from tests.utils import spy

    # Create the file outside of both repositories.
    external_dir = tmp_path_factory.mktemp("external_dep")
    external_path = external_dir / "file"
    external_path.write_text("file content")

    # Track the outside file from within the source repository.
    with erepo_dir.chdir():
        erepo_dir.dvc_add(fspath(external_path), commit="add external file")

    clone_root = fspath(tmp_dir)
    Git.clone(fspath(erepo_dir), clone_root)
    cloned_repo = Repo(clone_root)

    # Spy on the md5 helper that the local remote module refers to.
    md5_spy = spy(file_md5)
    mocker.patch.object(remote.local, "file_md5", md5_spy)

    cloned_repo.pull(trust_remote=True)

    recorded_calls = md5_spy.mock.call_args_list
    assert len(recorded_calls) == 1

0 comments on commit 0b1d1f5

Please sign in to comment.