Skip to content

Commit

Permalink
status: optimization for already downloaded files
Browse files Browse the repository at this point in the history
  • Loading branch information
Mr. Outis committed May 1, 2019
1 parent e99fc2d commit 29f58bd
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 6 deletions.
4 changes: 3 additions & 1 deletion dvc/data_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@ def __init__(self, repo, config=None):

@property
def _cloud(self):
"""Returns a Remote instance using the `core.remote` config"""
remote = self._core.get(Config.SECTION_CORE_REMOTE, "")
if remote != "":

if remote:
return self._init_remote(remote)

if self._core.get(Config.SECTION_CORE_CLOUD, None):
Expand Down
30 changes: 25 additions & 5 deletions dvc/remote/local/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,14 @@ def _group(self, checksum_infos, show_checksums=False):

return by_md5

def status(self, checksum_infos, remote, jobs=None, show_checksums=False):
def status(
self,
checksum_infos,
remote,
jobs=None,
show_checksums=False,
download=False,
):
logger.info("Preparing to collect status from {}".format(remote.url))
title = "Collecting information"

Expand All @@ -352,11 +359,20 @@ def status(self, checksum_infos, remote, jobs=None, show_checksums=False):

progress.update_target(title, 30, 100)

remote_exists = list(remote.cache_exists(md5s))
local_exists = self.cache_exists(md5s)

progress.update_target(title, 90, 100)
progress.update_target(title, 40, 100)

local_exists = self.cache_exists(md5s)
# This is a performance optimization. We can safely assume that,
# if the resources that we want to fetch are already cached,
# there's no need to check the remote storage for the existance of
# those files.
if download and sorted(local_exists) == sorted(md5s):
remote_exists = local_exists
else:
remote_exists = list(remote.cache_exists(md5s))

progress.update_target(title, 90, 100)

progress.finish_target(title)

Expand Down Expand Up @@ -436,7 +452,11 @@ def _process(
jobs = remote.JOBS

status_info = self.status(
checksum_infos, remote, jobs=jobs, show_checksums=show_checksums
checksum_infos,
remote,
jobs=jobs,
show_checksums=show_checksums,
download=download,
)

chunks = self._get_chunks(download, remote, status_info, status, jobs)
Expand Down
41 changes: 41 additions & 0 deletions tests/unit/remote/test_local.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from dvc.remote.local import RemoteLOCAL


def test_status_download_optimization(mocker):
"""When comparing the status to pull a remote cache,
And the desired files to fetch are already on the local cache,
Don't check the existance of the desired files on the remote cache
"""
remote = RemoteLOCAL(None, {})

checksum_infos = [
{
"path": "foo",
"metric": False,
"cache": True,
"persist": False,
"md5": "acbd18db4cc2f85cedef654fccc4a4d8",
},
{
"path": "bar",
"metric": False,
"cache": True,
"persist": False,
"md5": "37b51d194a7513e45b56f6524f2d51f2",
},
]

local_exists = [
"acbd18db4cc2f85cedef654fccc4a4d8",
"37b51d194a7513e45b56f6524f2d51f2",
]

mocker.patch.object(remote, "cache_exists", return_value=local_exists)

other_remote = mocker.Mock()
other_remote.url = "other_remote"
other_remote.cache_exists.return_value = []

remote.status(checksum_infos, other_remote, download=True)

assert other_remote.cache_exists.call_count == 0

0 comments on commit 29f58bd

Please sign in to comment.