diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py index e5f7f0189c..f9704336b9 100644 --- a/dvc/data_cloud.py +++ b/dvc/data_cloud.py @@ -48,8 +48,10 @@ def __init__(self, repo, config=None): @property def _cloud(self): + """Returns a Remote instance using the `core.remote` config""" remote = self._core.get(Config.SECTION_CORE_REMOTE, "") - if remote != "": + + if remote: return self._init_remote(remote) if self._core.get(Config.SECTION_CORE_CLOUD, None): diff --git a/dvc/remote/local/__init__.py b/dvc/remote/local/__init__.py index cd5633b74c..311309261b 100644 --- a/dvc/remote/local/__init__.py +++ b/dvc/remote/local/__init__.py @@ -336,7 +336,14 @@ def _group(self, checksum_infos, show_checksums=False): return by_md5 - def status(self, checksum_infos, remote, jobs=None, show_checksums=False): + def status( + self, + checksum_infos, + remote, + jobs=None, + show_checksums=False, + download=False, + ): logger.info("Preparing to collect status from {}".format(remote.url)) title = "Collecting information" @@ -352,11 +359,20 @@ def status(self, checksum_infos, remote, jobs=None, show_checksums=False): progress.update_target(title, 30, 100) - remote_exists = list(remote.cache_exists(md5s)) + local_exists = self.cache_exists(md5s) - progress.update_target(title, 90, 100) + progress.update_target(title, 40, 100) - local_exists = self.cache_exists(md5s) + # This is a performance optimization. We can safely assume that, + # if the resources that we want to fetch are already cached, + # there's no need to check the remote storage for the existence of + # those files. 
+ if download and sorted(local_exists) == sorted(md5s): + remote_exists = local_exists + else: + remote_exists = list(remote.cache_exists(md5s)) + + progress.update_target(title, 90, 100) progress.finish_target(title) @@ -436,7 +452,11 @@ def _process( jobs = remote.JOBS status_info = self.status( - checksum_infos, remote, jobs=jobs, show_checksums=show_checksums + checksum_infos, + remote, + jobs=jobs, + show_checksums=show_checksums, + download=download, ) chunks = self._get_chunks(download, remote, status_info, status, jobs) diff --git a/tests/unit/remote/test_local.py b/tests/unit/remote/test_local.py new file mode 100644 index 0000000000..dd547595e4 --- /dev/null +++ b/tests/unit/remote/test_local.py @@ -0,0 +1,41 @@ +from dvc.remote.local import RemoteLOCAL + + +def test_status_download_optimization(mocker): + """When comparing the status to pull a remote cache, + And the desired files to fetch are already on the local cache, + Don't check the existence of the desired files on the remote cache + """ + remote = RemoteLOCAL(None, {}) + + checksum_infos = [ + { + "path": "foo", + "metric": False, + "cache": True, + "persist": False, + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + }, + { + "path": "bar", + "metric": False, + "cache": True, + "persist": False, + "md5": "37b51d194a7513e45b56f6524f2d51f2", + }, + ] + + local_exists = [ + "acbd18db4cc2f85cedef654fccc4a4d8", + "37b51d194a7513e45b56f6524f2d51f2", + ] + + mocker.patch.object(remote, "cache_exists", return_value=local_exists) + + other_remote = mocker.Mock() + other_remote.url = "other_remote" + other_remote.cache_exists.return_value = [] + + remote.status(checksum_infos, other_remote, download=True) + + assert other_remote.cache_exists.call_count == 0