From 652f5ab439d2d72f3b3ddabd4f38f117762348d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Rowlands=20=28=EB=B3=80=EA=B8=B0=ED=98=B8=29?= Date: Fri, 12 Jun 2020 18:52:38 +0900 Subject: [PATCH] remote: use separate working tree instance for local cache (#3991) * dvcignore: ignore paths outside of CleanTree root * remote: local remote/cache should always use its own working tree * repo tree (CleanTree) instance should not be relevant to local cache operations. When the dvcignore CleanTree is relevant, repo.tree should be used. For local remote/cache operations, the local remote work_tree should be used. * state: tie state to local cache working tree * tests: update tests * state: use os.path.exists instead of WorkingTree.exists in some cases --- dvc/ignore.py | 5 ++--- dvc/remote/local.py | 28 ++++++---------------------- dvc/repo/__init__.py | 8 ++++---- dvc/state.py | 13 ++++++++----- tests/func/test_ignore.py | 7 ++----- tests/func/test_state.py | 4 ++-- 6 files changed, 24 insertions(+), 41 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 7ef67763bd..b7a59e8f0f 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -186,8 +186,7 @@ def _parents_exist(self, path): if path.parent == self.tree_root or Repo.DVC_DIR in path.parts: return True - # if path is outside of tree, assume this is a local remote/local cache - # link/move operation where we do not need to filter ignores + # paths outside of the CleanTree root should be ignored path = relpath(path, self.tree_root) if path.startswith("..") or ( os.name == "nt" @@ -195,7 +194,7 @@ def _parents_exist(self, path): [os.path.abspath(path), self.tree_root] ) ): - return True + return False # check if parent directories are in our ignores, starting from # tree_root diff --git a/dvc/remote/local.py b/dvc/remote/local.py index d3a49a8b50..1b52f1f233 100644 --- a/dvc/remote/local.py +++ b/dvc/remote/local.py @@ -50,18 +50,12 @@ def repo(self): return self.remote.repo @cached_property - def _work_tree(self): - if self.repo: - return WorkingTree(self.repo.root_dir) - return None - - @property def work_tree(self): # When using repo.brancher, repo.tree may change to/from WorkingTree to # GitTree arbitarily. When repo.tree is GitTree, local cache needs to # use its own WorkingTree instance. - if self.repo and not is_working_tree(self.repo.tree): - return self._work_tree + if self.repo: + return WorkingTree(self.repo.root_dir) return None @staticmethod @@ -72,23 +66,17 @@ def exists(self, path_info): assert isinstance(path_info, str) or path_info.scheme == "local" if not self.repo: return os.path.exists(path_info) - if self.work_tree and self.work_tree.exists(path_info): - return True - return self.repo.tree.exists(path_info) + return self.work_tree.exists(path_info) def isfile(self, path_info): if not self.repo: return os.path.isfile(path_info) - if self.work_tree and self.work_tree.isfile(path_info): - return True - return self.repo.tree.isfile(path_info) + return self.work_tree.isfile(path_info) def isdir(self, path_info): if not self.repo: return os.path.isdir(path_info) - if self.work_tree and self.work_tree.isdir(path_info): - return True - return self.repo.tree.isdir(path_info) + return self.work_tree.isdir(path_info) def iscopy(self, path_info): return not ( @@ -96,11 +84,7 @@ def iscopy(self, path_info): ) def walk_files(self, path_info, **kwargs): - if self.work_tree: - tree = self.work_tree - else: - tree = self.repo.tree - for fname in tree.walk_files(path_info): + for fname in self.work_tree.walk_files(path_info): yield PathInfo(fname) def is_empty(self, path_info): diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index dce0187784..dd0b1528ac 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -111,14 +111,14 @@ def __init__(self, root_dir=None, scm=None, rev=None): friendly=True, ) + self.cache = Cache(self) + self.cloud = DataCloud(self) + if not scm: # NOTE: storing state and link_state in the repository itself to # avoid any possible state corruption in 'shared cache dir' # scenario. - self.state = State(self) - - self.cache = Cache(self) - self.cloud = DataCloud(self) + self.state = State(self.cache.local) self.stage_cache = StageCache(self) diff --git a/dvc/state.py b/dvc/state.py index ba22d614e1..fd3924f932 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -7,7 +7,6 @@ from urllib.parse import urlencode, urlunparse from dvc.exceptions import DvcException -from dvc.scm.tree import WorkingTree from dvc.utils import current_timestamp, relpath, to_chunks from dvc.utils.fs import get_inode, get_mtime_and_size, remove @@ -89,10 +88,11 @@ class State: # pylint: disable=too-many-instance-attributes MAX_INT = 2 ** 63 - 1 MAX_UINT = 2 ** 64 - 2 - def __init__(self, repo): + def __init__(self, local_cache): + repo = local_cache.repo self.repo = repo self.root_dir = repo.root_dir - self.tree = WorkingTree(self.root_dir) + self.tree = local_cache.tree.work_tree state_config = repo.config.get("state", {}) self.row_limit = state_config.get("row_limit", self.STATE_ROW_LIMIT) @@ -394,6 +394,9 @@ def get(self, path_info): assert isinstance(path_info, str) or path_info.scheme == "local" path = os.fspath(path_info) + # NOTE: use os.path.exists instead of WorkingTree.exists + # WorkingTree.exists uses lexists() and will return True for broken + # symlinks that we cannot stat() in get_mtime_and_size if not os.path.exists(path): return None @@ -420,7 +423,7 @@ def save_link(self, path_info): """ assert isinstance(path_info, str) or path_info.scheme == "local" - if not os.path.exists(path_info): + if not self.tree.exists(path_info): return mtime, _ = get_mtime_and_size(path_info, self.tree) @@ -446,7 +449,7 @@ def get_unused_links(self, used): inode = self._from_sqlite(inode) path = os.path.join(self.root_dir, relpath) - if path in used or not os.path.exists(path): + if path in used or not self.tree.exists(path): continue actual_inode = get_inode(path) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index a0c648932b..d3a64435f9 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -10,7 +10,6 @@ DvcIgnorePatterns, DvcIgnoreRepo, ) -from dvc.remote import LocalRemote from dvc.repo import Repo from dvc.scm.tree import WorkingTree from dvc.utils import relpath @@ -139,8 +138,7 @@ def test_match_nested(tmp_dir, dvc): } ) - remote = LocalRemote(dvc, {}) - result = {os.fspath(f) for f in remote.tree.walk_files(".")} + result = {os.fspath(os.path.normpath(f)) for f in dvc.tree.walk_files(".")} assert result == {".dvcignore", "foo"} @@ -149,8 +147,7 @@ def test_ignore_external(tmp_dir, scm, dvc, tmp_path_factory): ext_dir = TmpDir(os.fspath(tmp_path_factory.mktemp("external_dir"))) ext_dir.gen({"y.backup": "y", "tmp": "ext tmp"}) - remote = LocalRemote(dvc, {}) - result = {relpath(f, ext_dir) for f in remote.tree.walk_files(ext_dir)} + result = {relpath(f, ext_dir) for f in dvc.tree.walk_files(ext_dir)} assert result == {"y.backup", "tmp"} diff --git a/tests/func/test_state.py b/tests/func/test_state.py index a36fbf6f84..616faf672f 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -13,7 +13,7 @@ def test_state(tmp_dir, dvc): path_info = PathInfo(path) md5 = file_md5(path)[0] - state = State(dvc) + state = State(dvc.cache.local) with state: state.save(path_info, md5) @@ -57,7 +57,7 @@ def get_inode_mocked(path): def test_get_state_record_for_inode(get_inode_mock, tmp_dir, dvc): tmp_dir.gen("foo", "foo content") - state = State(dvc) + state = State(dvc.cache.local) inode = state.MAX_INT + 2 assert inode != state._to_sqlite(inode)