Skip to content

Commit

Permalink
remote: use separate working tree instance for local cache (iterative…
Browse files Browse the repository at this point in the history
…#3991)

* dvcignore: ignore paths outside of CleanTree root

* remote: local remote/cache should always use its own working tree

* repo tree (CleanTree) instance should not be relevant to local cache
  operations. When the dvcignore CleanTree is relevant, repo.tree should
  be used. For local remote/cache operations, the local remote work_tree
  should be used.

* state: tie state to local cache working tree

* tests: update tests

* state: use os.path.exists instead of WorkingTree.exists in some cases
  • Loading branch information
pmrowla authored Jun 12, 2020
1 parent 28a123e commit 652f5ab
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 41 deletions.
5 changes: 2 additions & 3 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,16 +186,15 @@ def _parents_exist(self, path):
if path.parent == self.tree_root or Repo.DVC_DIR in path.parts:
return True

# if path is outside of tree, assume this is a local remote/local cache
# link/move operation where we do not need to filter ignores
# paths outside of the CleanTree root should be ignored
path = relpath(path, self.tree_root)
if path.startswith("..") or (
os.name == "nt"
and not os.path.commonprefix(
[os.path.abspath(path), self.tree_root]
)
):
return True
return False

# check if parent directories are in our ignores, starting from
# tree_root
Expand Down
28 changes: 6 additions & 22 deletions dvc/remote/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,12 @@ def repo(self):
return self.remote.repo

@cached_property
def _work_tree(self):
if self.repo:
return WorkingTree(self.repo.root_dir)
return None

@property
def work_tree(self):
# When using repo.brancher, repo.tree may change to/from WorkingTree to
# GitTree arbitarily. When repo.tree is GitTree, local cache needs to
# use its own WorkingTree instance.
if self.repo and not is_working_tree(self.repo.tree):
return self._work_tree
if self.repo:
return WorkingTree(self.repo.root_dir)
return None

@staticmethod
Expand All @@ -72,35 +66,25 @@ def exists(self, path_info):
assert isinstance(path_info, str) or path_info.scheme == "local"
if not self.repo:
return os.path.exists(path_info)
if self.work_tree and self.work_tree.exists(path_info):
return True
return self.repo.tree.exists(path_info)
return self.work_tree.exists(path_info)

def isfile(self, path_info):
if not self.repo:
return os.path.isfile(path_info)
if self.work_tree and self.work_tree.isfile(path_info):
return True
return self.repo.tree.isfile(path_info)
return self.work_tree.isfile(path_info)

def isdir(self, path_info):
if not self.repo:
return os.path.isdir(path_info)
if self.work_tree and self.work_tree.isdir(path_info):
return True
return self.repo.tree.isdir(path_info)
return self.work_tree.isdir(path_info)

def iscopy(self, path_info):
return not (
System.is_symlink(path_info) or System.is_hardlink(path_info)
)

def walk_files(self, path_info, **kwargs):
if self.work_tree:
tree = self.work_tree
else:
tree = self.repo.tree
for fname in tree.walk_files(path_info):
for fname in self.work_tree.walk_files(path_info):
yield PathInfo(fname)

def is_empty(self, path_info):
Expand Down
8 changes: 4 additions & 4 deletions dvc/repo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,14 @@ def __init__(self, root_dir=None, scm=None, rev=None):
friendly=True,
)

self.cache = Cache(self)
self.cloud = DataCloud(self)

if not scm:
# NOTE: storing state and link_state in the repository itself to
# avoid any possible state corruption in 'shared cache dir'
# scenario.
self.state = State(self)

self.cache = Cache(self)
self.cloud = DataCloud(self)
self.state = State(self.cache.local)

self.stage_cache = StageCache(self)

Expand Down
13 changes: 8 additions & 5 deletions dvc/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from urllib.parse import urlencode, urlunparse

from dvc.exceptions import DvcException
from dvc.scm.tree import WorkingTree
from dvc.utils import current_timestamp, relpath, to_chunks
from dvc.utils.fs import get_inode, get_mtime_and_size, remove

Expand Down Expand Up @@ -89,10 +88,11 @@ class State: # pylint: disable=too-many-instance-attributes
MAX_INT = 2 ** 63 - 1
MAX_UINT = 2 ** 64 - 2

def __init__(self, repo):
def __init__(self, local_cache):
repo = local_cache.repo
self.repo = repo
self.root_dir = repo.root_dir
self.tree = WorkingTree(self.root_dir)
self.tree = local_cache.tree.work_tree

state_config = repo.config.get("state", {})
self.row_limit = state_config.get("row_limit", self.STATE_ROW_LIMIT)
Expand Down Expand Up @@ -394,6 +394,9 @@ def get(self, path_info):
assert isinstance(path_info, str) or path_info.scheme == "local"
path = os.fspath(path_info)

# NOTE: use os.path.exists instead of WorkingTree.exists
# WorkingTree.exists uses lexists() and will return True for broken
# symlinks that we cannot stat() in get_mtime_and_size
if not os.path.exists(path):
return None

Expand All @@ -420,7 +423,7 @@ def save_link(self, path_info):
"""
assert isinstance(path_info, str) or path_info.scheme == "local"

if not os.path.exists(path_info):
if not self.tree.exists(path_info):
return

mtime, _ = get_mtime_and_size(path_info, self.tree)
Expand All @@ -446,7 +449,7 @@ def get_unused_links(self, used):
inode = self._from_sqlite(inode)
path = os.path.join(self.root_dir, relpath)

if path in used or not os.path.exists(path):
if path in used or not self.tree.exists(path):
continue

actual_inode = get_inode(path)
Expand Down
7 changes: 2 additions & 5 deletions tests/func/test_ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
DvcIgnorePatterns,
DvcIgnoreRepo,
)
from dvc.remote import LocalRemote
from dvc.repo import Repo
from dvc.scm.tree import WorkingTree
from dvc.utils import relpath
Expand Down Expand Up @@ -139,8 +138,7 @@ def test_match_nested(tmp_dir, dvc):
}
)

remote = LocalRemote(dvc, {})
result = {os.fspath(f) for f in remote.tree.walk_files(".")}
result = {os.fspath(os.path.normpath(f)) for f in dvc.tree.walk_files(".")}
assert result == {".dvcignore", "foo"}


Expand All @@ -149,8 +147,7 @@ def test_ignore_external(tmp_dir, scm, dvc, tmp_path_factory):
ext_dir = TmpDir(os.fspath(tmp_path_factory.mktemp("external_dir")))
ext_dir.gen({"y.backup": "y", "tmp": "ext tmp"})

remote = LocalRemote(dvc, {})
result = {relpath(f, ext_dir) for f in remote.tree.walk_files(ext_dir)}
result = {relpath(f, ext_dir) for f in dvc.tree.walk_files(ext_dir)}
assert result == {"y.backup", "tmp"}


Expand Down
4 changes: 2 additions & 2 deletions tests/func/test_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def test_state(tmp_dir, dvc):
path_info = PathInfo(path)
md5 = file_md5(path)[0]

state = State(dvc)
state = State(dvc.cache.local)

with state:
state.save(path_info, md5)
Expand Down Expand Up @@ -57,7 +57,7 @@ def get_inode_mocked(path):
def test_get_state_record_for_inode(get_inode_mock, tmp_dir, dvc):
tmp_dir.gen("foo", "foo content")

state = State(dvc)
state = State(dvc.cache.local)
inode = state.MAX_INT + 2
assert inode != state._to_sqlite(inode)

Expand Down

0 comments on commit 652f5ab

Please sign in to comment.