Skip to content

Commit

Permalink
remote: separate cloud remote and cloud cache classes (iterative#4019)
Browse files Browse the repository at this point in the history
* remote: move get_file_checksum() into tree

* makes RemoteTree and RepoTree consistent with regard to checksum
  calculation

* remote: save() now takes explicit tree parameter

* when tree is remote.tree, save will be a move + link operation
  (same as default existing behavior)
* when saving path from a different tree, save will be a copy operation

* tests: update for moved remote/tree functions

* remote: move get_checksum into tree

* remote: separate cloud remote and cache classes

* tests: update unit tests for remote/cache separation

* remote: cloud cache should extend cloud remote

* remote: move LocalRemote.get

* tests: update func tests

* dependency: update for moved get_checksum

* remote: fix state lookup bug
  • Loading branch information
pmrowla authored Jun 12, 2020
1 parent 652f5ab commit 9ead641
Show file tree
Hide file tree
Showing 25 changed files with 886 additions and 849 deletions.
8 changes: 4 additions & 4 deletions dvc/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@ def _make_remote_property(name):
"""

def getter(self):
from dvc.remote import Remote
from dvc.remote import Cache as CloudCache

remote = self.config.get(name)
if not remote:
return None

return Remote(self.repo, name=remote)
return CloudCache(self.repo, name=remote)

getter.__name__ = name
return cached_property(getter)
Expand All @@ -50,7 +50,7 @@ class Cache:
CACHE_DIR = "cache"

def __init__(self, repo):
from dvc.remote import Remote
from dvc.remote import Cache as CloudCache

self.repo = repo
self.config = config = repo.config["cache"]
Expand All @@ -62,7 +62,7 @@ def __init__(self, repo):
else:
settings = {**config, "url": config["dir"]}

self.local = Remote(repo, **settings)
self.local = CloudCache(repo, **settings)

s3 = _make_remote_property("s3")
gs = _make_remote_property("gs")
Expand Down
4 changes: 1 addition & 3 deletions dvc/dependency/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,7 @@ def _get_checksum(self, locked=True):

# We are polluting our repo cache with some dir listing here
if tree.isdir(path):
return self.repo.cache.local.get_dir_checksum(
path, tree=tree
)
return self.repo.cache.local.get_checksum(path, tree)
return tree.get_file_checksum(path)

def status(self):
Expand Down
3 changes: 2 additions & 1 deletion dvc/external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,9 @@ def download_update(result):
raise PathMissingError(path, self.url)
save_info = self.local_cache.save(
path,
self.repo_tree,
None,
tree=self.repo_tree,
save_link=False,
download_callback=download_update,
)
save_infos.append(save_info)
Expand Down
2 changes: 1 addition & 1 deletion dvc/output/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def save(self):

def commit(self):
if self.use_cache:
self.cache.save(self.path_info, self.info)
self.cache.save(self.path_info, self.cache.tree, self.info)

def dumpd(self):
ret = copy(self.info)
Expand Down
42 changes: 30 additions & 12 deletions dvc/remote/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,25 @@
import posixpath
from urllib.parse import urlparse

from dvc.remote.azure import AzureRemote
from dvc.remote.azure import AzureCache, AzureRemote
from dvc.remote.gdrive import GDriveRemote
from dvc.remote.gs import GSRemote
from dvc.remote.hdfs import HDFSRemote
from dvc.remote.gs import GSCache, GSRemote
from dvc.remote.hdfs import HDFSCache, HDFSRemote
from dvc.remote.http import HTTPRemote
from dvc.remote.https import HTTPSRemote
from dvc.remote.local import LocalRemote
from dvc.remote.local import LocalCache, LocalRemote
from dvc.remote.oss import OSSRemote
from dvc.remote.s3 import S3Remote
from dvc.remote.ssh import SSHRemote
from dvc.remote.s3 import S3Cache, S3Remote
from dvc.remote.ssh import SSHCache, SSHRemote

CACHES = [
AzureCache,
GSCache,
HDFSCache,
S3Cache,
SSHCache,
# LocalCache is the default
]

REMOTES = [
AzureRemote,
Expand All @@ -26,21 +35,30 @@
]


def _get(remote_conf):
for remote in REMOTES:
def _get(remote_conf, remotes, default):
for remote in remotes:
if remote.supported(remote_conf):
return remote
return LocalRemote
return default


def Remote(repo, **kwargs):
def _get_conf(repo, **kwargs):
name = kwargs.get("name")
if name:
remote_conf = repo.config["remote"][name.lower()]
else:
remote_conf = kwargs
remote_conf = _resolve_remote_refs(repo.config, remote_conf)
return _get(remote_conf)(repo, remote_conf)
return _resolve_remote_refs(repo.config, remote_conf)


def Remote(repo, **kwargs):
remote_conf = _get_conf(repo, **kwargs)
return _get(remote_conf, REMOTES, LocalRemote)(repo, remote_conf)


def Cache(repo, **kwargs):
remote_conf = _get_conf(repo, **kwargs)
return _get(remote_conf, CACHES, LocalCache)(repo, remote_conf)


def _resolve_remote_refs(config, remote_conf):
Expand Down
12 changes: 8 additions & 4 deletions dvc/remote/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from dvc.path_info import CloudURLInfo
from dvc.progress import Tqdm
from dvc.remote.base import BaseRemote, BaseRemoteTree
from dvc.remote.base import BaseRemote, BaseRemoteTree, CacheMixin
from dvc.scheme import Schemes

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -108,6 +108,9 @@ def remove(self, path_info):
logger.debug(f"Removing {path_info}")
self.blob_service.delete_blob(path_info.bucket, path_info.path)

def get_file_checksum(self, path_info):
return self.get_etag(path_info)

def _upload(
self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs
):
Expand All @@ -134,10 +137,11 @@ def _download(
class AzureRemote(BaseRemote):
scheme = Schemes.AZURE
REQUIRES = {"azure-storage-blob": "azure.storage.blob"}
TREE_CLS = AzureRemoteTree
PARAM_CHECKSUM = "etag"
COPY_POLL_SECONDS = 5
LIST_OBJECT_PAGE_SIZE = 5000
TREE_CLS = AzureRemoteTree

def get_file_checksum(self, path_info):
return self.tree.get_etag(path_info)

class AzureCache(AzureRemote, CacheMixin):
pass
Loading

0 comments on commit 9ead641

Please sign in to comment.