Skip to content

Commit

Permalink
optimize local status (iterative#3867)
Browse files Browse the repository at this point in the history
* remote: remove unpacked dir

We've done a lot of optimizations lately, which made unpacked dir trick
obsoleted.

* local: save us a stat call in is_protected

* local: speedup checksum_to_path ~x5.5

* tree: walk: don't use os.path.join

It is ~5.5 times slower than joining by hand.

* dvcignore: optimize matching x10

On a repo with a dvcignore with 1 pattern and a directory with 400K
files, `dvc status` now takes ~8 sec instead of ~30 sec.

To achieve that, we make some assumptions about the paths formats that
we are dealing with, so we could use simpler logic instead of using
very slow `relpath`, `abspath` etc on every entry in a directory.

It is also clear that CleanTree behavior is inconsistent (even tests
expect very different outputs from it), so we will need to look into
this later.
  • Loading branch information
efiop authored May 25, 2020
1 parent c5920b0 commit d77af81
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 158 deletions.
22 changes: 15 additions & 7 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from pathspec.patterns import GitWildMatchPattern

from dvc.scm.tree import BaseTree
from dvc.utils import relpath

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -35,12 +34,19 @@ def __call__(self, root, dirs, files):
return dirs, files

def matches(self, dirname, basename):
abs_path = os.path.join(dirname, basename)
rel_path = relpath(abs_path, self.dirname)

if os.pardir + os.sep in rel_path:
# NOTE: `relpath` is too slow, so we have to assume that both
# `dirname` and `self.dirname` are relative or absolute together.
prefix = self.dirname + os.sep
if dirname == self.dirname:
path = basename
elif dirname.startswith(prefix):
rel = dirname[len(prefix) :]
# NOTE: `os.path.join` is ~x5.5 slower
path = f"{rel}{os.sep}{basename}"
else:
return False
return self.ignore_spec.match_file(rel_path)

return self.ignore_spec.match_file(path)

def __hash__(self):
return hash(self.ignore_file_path)
Expand Down Expand Up @@ -135,7 +141,9 @@ def isexec(self, path):

def walk(self, top, topdown=True):
for root, dirs, files in self.tree.walk(top, topdown):
dirs[:], files[:] = self.dvcignore(root, dirs, files)
dirs[:], files[:] = self.dvcignore(
os.path.abspath(root), dirs, files
)

yield root, dirs, files

Expand Down
25 changes: 2 additions & 23 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,7 @@ def all(self, jobs=None, name=None):

@index_locked
def gc(self, named_cache, jobs=None):
used = self.extract_used_local_checksums(named_cache)
used = set(named_cache.scheme_keys("local"))

if self.scheme != "":
used.update(named_cache.scheme_keys(self.scheme))
Expand All @@ -847,6 +847,7 @@ def gc(self, named_cache, jobs=None):
continue
path_info = self.checksum_to_path_info(checksum)
if self.is_dir_checksum(checksum):
# backward compatibility
self._remove_unpacked_dir(checksum)
self.remove(path_info)
removed = True
Expand Down Expand Up @@ -905,11 +906,6 @@ def _changed_dir_cache(self, checksum, path_info=None, filter_info=None):
if self.changed_cache_file(checksum):
return True

if not (path_info and filter_info) and not self._changed_unpacked_dir(
checksum
):
return False

for entry in self.get_dir_cache(checksum):
entry_checksum = entry[self.PARAM_CHECKSUM]

Expand All @@ -921,9 +917,6 @@ def _changed_dir_cache(self, checksum, path_info=None, filter_info=None):
if self.changed_cache_file(entry_checksum):
return True

if not (path_info and filter_info):
self._update_unpacked_dir(checksum)

return False

def changed_cache(self, checksum, path_info=None, filter_info=None):
Expand Down Expand Up @@ -1362,19 +1355,5 @@ def get_files_number(self, path_info, checksum, filter_info):
def unprotect(path_info):
pass

def _get_unpacked_dir_names(self, checksums):
return set()

def extract_used_local_checksums(self, named_cache):
used = set(named_cache.scheme_keys("local"))
unpacked = self._get_unpacked_dir_names(used)
return used | unpacked

def _changed_unpacked_dir(self, checksum):
return True

def _update_unpacked_dir(self, checksum):
pass

def _remove_unpacked_dir(self, checksum):
pass
72 changes: 11 additions & 61 deletions dvc/remote/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,12 @@ def cache_path(self):
return os.path.abspath(self.cache_dir)

def checksum_to_path(self, checksum):
return os.path.join(self.cache_path, checksum[0:2], checksum[2:])
# NOTE: `self.cache_path` is already normalized so we can simply use
# `os.sep` instead of `os.path.join`. This results in this helper
# being ~5.5 times faster.
return (
f"{self.cache_path}{os.sep}{checksum[0:2]}{os.sep}{checksum[2:]}"
)

def list_cache_paths(self, prefix=None, progress_callback=None):
assert self.path_info is not None
Expand Down Expand Up @@ -695,70 +700,15 @@ def protect(self, path_info):
if actual != mode:
raise

def _get_unpacked_dir_path_info(self, checksum):
info = self.checksum_to_path_info(checksum)
return info.with_name(info.name + self.UNPACKED_DIR_SUFFIX)

def _remove_unpacked_dir(self, checksum):
path_info = self._get_unpacked_dir_path_info(checksum)
info = self.checksum_to_path_info(checksum)
path_info = info.with_name(info.name + self.UNPACKED_DIR_SUFFIX)
self.remove(path_info)

def _path_info_changed(self, path_info):
if self.exists(path_info) and self.state.get(path_info):
return False
return True

def _update_unpacked_dir(self, checksum):
unpacked_dir_info = self._get_unpacked_dir_path_info(checksum)

if not self._path_info_changed(unpacked_dir_info):
return

self.remove(unpacked_dir_info)

try:
dir_info = self.get_dir_cache(checksum)
self._create_unpacked_dir(checksum, dir_info, unpacked_dir_info)
except DvcException:
logger.warning(f"Could not create '{unpacked_dir_info}'")

self.remove(unpacked_dir_info)

def _create_unpacked_dir(self, checksum, dir_info, unpacked_dir_info):
self.makedirs(unpacked_dir_info)

for entry in Tqdm(dir_info, desc="Creating unpacked dir", unit="file"):
entry_cache_info = self.checksum_to_path_info(
entry[self.PARAM_CHECKSUM]
)
relative_path = entry[self.PARAM_RELPATH]
# In shared cache mode some cache files might not be owned by the
# user, so we need to use symlinks because, unless
# /proc/sys/fs/protected_hardlinks is disabled, the user is not
# allowed to create hardlinks to files that he doesn't own.
link_types = ["hardlink", "symlink"]
self._link(
entry_cache_info, unpacked_dir_info / relative_path, link_types
)

self.state.save(unpacked_dir_info, checksum)

def _changed_unpacked_dir(self, checksum):
status_unpacked_dir_info = self._get_unpacked_dir_path_info(checksum)

return not self.state.get(status_unpacked_dir_info)

def _get_unpacked_dir_names(self, checksums):
unpacked = set()
for c in checksums:
if self.is_dir_checksum(c):
unpacked.add(c + self.UNPACKED_DIR_SUFFIX)
return unpacked

def is_protected(self, path_info):
if not self.exists(path_info):
try:
mode = os.stat(path_info).st_mode
except FileNotFoundError:
return False

mode = os.stat(path_info).st_mode

return stat.S_IMODE(mode) == self.CACHE_MODE
3 changes: 2 additions & 1 deletion dvc/scm/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ def walk(self, top, topdown=True):
def walk_files(self, top):
for root, _, files in self.walk(top):
for file in files:
yield os.path.join(root, file)
# NOTE: os.path.join is ~5.5 times slower
yield f"{root}{os.sep}{file}"


class WorkingTree(BaseTree):
Expand Down
65 changes: 0 additions & 65 deletions tests/func/remote/test_local.py

This file was deleted.

2 changes: 2 additions & 0 deletions tests/func/test_gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,8 @@ def test_gc_no_unpacked_dir(tmp_dir, dvc):
dir_stages[0].outs[0].cache_path + LocalRemote.UNPACKED_DIR_SUFFIX
)

# older (pre 1.0) versions of dvc used to generate this dir
shutil.copytree("dir", unpackeddir)
assert os.path.exists(unpackeddir)

dvc.gc(force=True, workspace=True)
Expand Down
1 change: 0 additions & 1 deletion tests/unit/test_ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ def test_ignore_from_file_should_filter_dirs_and_files():
),
("dont_ignore.txt", ["dont_ignore"], False),
("dont_ignore.txt", ["dont*", "!dont_ignore.txt"], False),
("../../../something.txt", ["**/something.txt"], False),
],
)
def test_match_ignore_from_file(
Expand Down

0 comments on commit d77af81

Please sign in to comment.