diff --git a/dvc/api.py b/dvc/api.py index 4b5e846878..9c31c4aec2 100644 --- a/dvc/api.py +++ b/dvc/api.py @@ -30,7 +30,7 @@ def get_url(path, repo=None, rev=None, remote=None): raise UrlNotDvcRepoError(_repo.url) out = _repo.find_out_by_relpath(path) remote_obj = _repo.cloud.get_remote(remote) - return str(remote_obj.checksum_to_path_info(out.checksum)) + return str(remote_obj.hash_to_path_info(out.checksum)) def open(path, repo=None, rev=None, remote=None, mode="r", encoding=None): diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py index f6afd28b51..68c88e5b00 100644 --- a/dvc/data_cloud.py +++ b/dvc/data_cloud.py @@ -92,7 +92,7 @@ def pull( def _save_pulled_checksums(self, cache): for checksum in cache.scheme_keys("local"): - cache_file = self.repo.cache.local.checksum_to_path_info(checksum) + cache_file = self.repo.cache.local.hash_to_path_info(checksum) if self.repo.cache.local.tree.exists(cache_file): # We can safely save here, as existing corrupted files will # be removed upon status, while files corrupted during diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index afa1f3fbe7..8566c080b9 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -64,8 +64,8 @@ def _get_checksum(self, locked=True): # We are polluting our repo cache with some dir listing here if tree.isdir(path): - return self.repo.cache.local.get_checksum(path, tree) - return tree.get_file_checksum(path) + return self.repo.cache.local.get_hash(path, tree) + return tree.get_file_hash(path) def status(self): current_checksum = self._get_checksum(locked=True) diff --git a/dvc/output/base.py b/dvc/output/base.py index cc95fa655f..37d30a210c 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -163,7 +163,7 @@ def supported(cls, url): @property def cache_path(self): - return self.cache.checksum_to_path_info(self.checksum).url + return self.cache.hash_to_path_info(self.checksum).url @property def checksum_type(self): @@ -178,11 +178,11 @@ def checksum(self, checksum): self.info[self.remote.tree.PARAM_CHECKSUM] = checksum def get_checksum(self): - return self.remote.get_checksum(self.path_info) + return self.remote.get_hash(self.path_info) @property def is_dir_checksum(self): - return self.remote.is_dir_checksum(self.checksum) + return self.remote.is_dir_hash(self.checksum) @property def exists(self): diff --git a/dvc/remote/azure.py b/dvc/remote/azure.py index c162072a88..4ce0cc3f27 100644 --- a/dvc/remote/azure.py +++ b/dvc/remote/azure.py @@ -113,7 +113,7 @@ def remove(self, path_info): logger.debug(f"Removing {path_info}") self.blob_service.delete_blob(path_info.bucket, path_info.path) - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): return self.get_etag(path_info) def _upload( diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 71b8100f7a..8affd571a1 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -65,9 +65,9 @@ class RemoteMissingDepsError(DvcException): class DirCacheError(DvcException): - def __init__(self, checksum): + def __init__(self, hash_): super().__init__( - f"Failed to load dir cache for hash value: '{checksum}'." + f"Failed to load dir cache for hash value: '{hash_}'." 
) @@ -90,7 +90,7 @@ class BaseRemoteTree: PARAM_RELPATH = "relpath" CHECKSUM_DIR_SUFFIX = ".dir" - CHECKSUM_JOBS = max(1, min(4, cpu_count() // 2)) + HASH_JOBS = max(1, min(4, cpu_count() // 2)) DEFAULT_VERIFY = False LIST_OBJECT_PAGE_SIZE = 1000 TRAVERSE_WEIGHT_MULTIPLIER = 5 @@ -113,10 +113,10 @@ def __init__(self, repo, config): shared = config.get("shared") self._file_mode, self._dir_mode = self.SHARED_MODE_MAP[shared] - self.checksum_jobs = ( - config.get("checksum_jobs") - or (self.repo and self.repo.config["core"].get("checksum_jobs")) - or self.CHECKSUM_JOBS + self.hash_jobs = ( + config.get("hash_jobs") + or (self.repo and self.repo.config["core"].get("hash_jobs")) + or self.HASH_JOBS ) self.verify = config.get("verify", self.DEFAULT_VERIFY) @@ -255,12 +255,12 @@ def unprotect(path_info): pass @classmethod - def is_dir_checksum(cls, checksum): - if not checksum: + def is_dir_hash(cls, hash_): + if not hash_: return False - return checksum.endswith(cls.CHECKSUM_DIR_SUFFIX) + return hash_.endswith(cls.CHECKSUM_DIR_SUFFIX) - def get_checksum(self, path_info, tree=None, **kwargs): + def get_hash(self, path_info, tree=None, **kwargs): assert isinstance(path_info, str) or path_info.scheme == self.scheme if not tree: @@ -270,47 +270,47 @@ def get_checksum(self, path_info, tree=None, **kwargs): return None if tree == self: - checksum = self.state.get(path_info) + hash_ = self.state.get(path_info) else: - checksum = None + hash_ = None - # If we have dir checksum in state db, but dir cache file is lost, - # then we need to recollect the dir via .get_dir_checksum() call below, + # If we have dir hash in state db, but dir cache file is lost, + # then we need to recollect the dir via .get_dir_hash() call below, # see https://github.com/iterative/dvc/issues/2219 for context if ( - checksum - and self.is_dir_checksum(checksum) - and not tree.exists(self.cache.checksum_to_path_info(checksum)) + hash_ + and self.is_dir_hash(hash_) + and not tree.exists(self.cache.hash_to_path_info(hash_)) ): - checksum = None + hash_ = None - if checksum: - return checksum + if hash_: + return hash_ if tree.isdir(path_info): - checksum = self.get_dir_checksum(path_info, tree, **kwargs) + hash_ = self.get_dir_hash(path_info, tree, **kwargs) else: - checksum = tree.get_file_checksum(path_info) + hash_ = tree.get_file_hash(path_info) - if checksum and self.exists(path_info): - self.state.save(path_info, checksum) + if hash_ and self.exists(path_info): + self.state.save(path_info, hash_) - return checksum + return hash_ - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): raise NotImplementedError - def get_dir_checksum(self, path_info, tree, **kwargs): + def get_dir_hash(self, path_info, tree, **kwargs): if not self.cache: raise RemoteCacheRequiredError(path_info) dir_info = self._collect_dir(path_info, tree, **kwargs) return self._save_dir_info(dir_info, path_info) - def checksum_to_path_info(self, checksum): - return self.path_info / checksum[0:2] / checksum[2:] + def hash_to_path_info(self, hash_): + return self.path_info / hash_[0:2] / hash_[2:] - def path_to_checksum(self, path): + def path_to_hash(self, path): parts = self.PATH_CLS(path).parts[-2:] if not (len(parts) == 2 and parts[0] and len(parts[0]) == 2): @@ -320,25 +320,21 @@ def path_to_checksum(self, path): def save_info(self, path_info, tree=None, **kwargs): return { - self.PARAM_CHECKSUM: self.get_checksum( - path_info, tree=tree, **kwargs - ) + self.PARAM_CHECKSUM: self.get_hash(path_info, tree=tree, **kwargs) } - def 
_calculate_checksums(self, file_infos, tree): + def _calculate_hashes(self, file_infos, tree): file_infos = list(file_infos) with Tqdm( total=len(file_infos), unit="md5", desc="Computing file/dir hashes (only done once)", ) as pbar: - worker = pbar.wrap_fn(tree.get_file_checksum) - with ThreadPoolExecutor( - max_workers=self.checksum_jobs - ) as executor: + worker = pbar.wrap_fn(tree.get_file_hash) + with ThreadPoolExecutor(max_workers=self.hash_jobs) as executor: tasks = executor.map(worker, file_infos) - checksums = dict(zip(file_infos, tasks)) - return checksums + hashes = dict(zip(file_infos, tasks)) + return hashes def _collect_dir(self, path_info, tree, **kwargs): file_infos = set() @@ -349,17 +345,15 @@ def _collect_dir(self, path_info, tree, **kwargs): file_infos.add(fname) - checksums = {fi: self.state.get(fi) for fi in file_infos} - not_in_state = { - fi for fi, checksum in checksums.items() if checksum is None - } + hashes = {fi: self.state.get(fi) for fi in file_infos} + not_in_state = {fi for fi, hash_ in hashes.items() if hash_ is None} - new_checksums = self._calculate_checksums(not_in_state, tree) - checksums.update(new_checksums) + new_hashes = self._calculate_hashes(not_in_state, tree) + hashes.update(new_hashes) result = [ { - self.PARAM_CHECKSUM: checksums[fi], + self.PARAM_CHECKSUM: hashes[fi], # NOTE: this is lossy transformation: # "hey\there" -> "hey/there" # "hey/there" -> "hey/there" @@ -377,21 +371,21 @@ def _collect_dir(self, path_info, tree, **kwargs): return sorted(result, key=itemgetter(self.PARAM_RELPATH)) def _save_dir_info(self, dir_info, path_info): - checksum, tmp_info = self._get_dir_info_checksum(dir_info) - new_info = self.cache.checksum_to_path_info(checksum) - if self.cache.changed_cache_file(checksum): + hash_, tmp_info = self._get_dir_info_hash(dir_info) + new_info = self.cache.hash_to_path_info(hash_) + if self.cache.changed_cache_file(hash_): self.cache.tree.makedirs(new_info.parent) self.cache.tree.move( tmp_info, new_info, mode=self.cache.CACHE_MODE ) if self.exists(path_info): - self.state.save(path_info, checksum) - self.state.save(new_info, checksum) + self.state.save(path_info, hash_) + self.state.save(new_info, hash_) - return checksum + return hash_ - def _get_dir_info_checksum(self, dir_info): + def _get_dir_info_hash(self, dir_info): tmp = tempfile.NamedTemporaryFile(delete=False).name with open(tmp, "w+") as fobj: json.dump(dir_info, fobj, sort_keys=True) @@ -401,8 +395,8 @@ def _get_dir_info_checksum(self, dir_info): to_info = tree.path_info / tmp_fname("") tree.upload(from_info, to_info, no_progress_bar=True) - checksum = tree.get_file_checksum(to_info) + self.CHECKSUM_DIR_SUFFIX - return checksum, to_info + hash_ = tree.get_file_hash(to_info) + self.CHECKSUM_DIR_SUFFIX + return hash_, to_info def upload(self, from_info, to_info, name=None, no_progress_bar=False): if not hasattr(self, "_upload"): @@ -530,73 +524,71 @@ def list_paths(self, prefix=None, progress_callback=None): else: yield from self.walk_files(path_info) - def list_checksums(self, prefix=None, progress_callback=None): - """Iterate over checksums in this tree. + def list_hashes(self, prefix=None, progress_callback=None): + """Iterate over hashes in this tree. - If `prefix` is specified, only checksums which begin with `prefix` + If `prefix` is specified, only hashes which begin with `prefix` will be returned. 
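A minimal standalone sketch of how the renamed _get_dir_info_hash() above derives a directory hash: the directory listing is serialized as JSON with sorted keys, that content is hashed, and the ".dir" suffix is appended. Here hashlib/json stand in for the temp-file upload and get_file_hash() calls, and the entry keys mirror the local tree's "md5"/"relpath" parameters.

import hashlib
import json

CHECKSUM_DIR_SUFFIX = ".dir"

def dir_info_hash(dir_info):
    # json.dumps(..., sort_keys=True) matches what the patch writes to a temp
    # file before hashing it; hashing the equivalent string gives the same md5.
    payload = json.dumps(dir_info, sort_keys=True).encode("utf-8")
    return hashlib.md5(payload).hexdigest() + CHECKSUM_DIR_SUFFIX

entries = [{"md5": "acbd18db4cc2f85cedef654fccc4a4d8", "relpath": "foo"}]
print(dir_info_hash(entries))  # -> "<md5-of-json>.dir"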
""" for path in self.list_paths(prefix, progress_callback): try: - yield self.path_to_checksum(path) + yield self.path_to_hash(path) except ValueError: logger.debug( "'%s' doesn't look like a cache file, skipping", path ) def all(self, jobs=None, name=None): - """Iterate over all checksums in this tree. + """Iterate over all hashes in this tree. - Checksums will be fetched in parallel threads according to prefix + Hashes will be fetched in parallel threads according to prefix (except for small remotes) and a progress bar will be displayed. """ logger.debug( - "Fetching all checksums from '{}'".format( + "Fetching all hashes from '{}'".format( name if name else "remote cache" ) ) if not self.CAN_TRAVERSE: - return self.list_checksums() + return self.list_hashes() - remote_size, remote_checksums = self.estimate_remote_size(name=name) - return self.list_checksums_traverse( - remote_size, remote_checksums, jobs, name + remote_size, remote_hashes = self.estimate_remote_size(name=name) + return self.list_hashes_traverse( + remote_size, remote_hashes, jobs, name ) - def _checksums_with_limit( - self, limit, prefix=None, progress_callback=None - ): + def _hashes_with_limit(self, limit, prefix=None, progress_callback=None): count = 0 - for checksum in self.list_checksums(prefix, progress_callback): - yield checksum + for hash_ in self.list_hashes(prefix, progress_callback): + yield hash_ count += 1 if count > limit: logger.debug( - "`list_checksums()` returned max '{}' checksums, " + "`list_hashes()` returned max '{}' hashes, " "skipping remaining results".format(limit) ) return - def _max_estimation_size(self, checksums): + def _max_estimation_size(self, hashes): # Max remote size allowed for us to use traverse method return max( self.TRAVERSE_THRESHOLD_SIZE, - len(checksums) + len(hashes) / self.TRAVERSE_WEIGHT_MULTIPLIER * self.LIST_OBJECT_PAGE_SIZE, ) - def estimate_remote_size(self, checksums=None, name=None): + def estimate_remote_size(self, hashes=None, name=None): """Estimate tree size based on number of entries beginning with "00..." prefix. """ prefix = "0" * self.TRAVERSE_PREFIX_LEN total_prefixes = pow(16, self.TRAVERSE_PREFIX_LEN) - if checksums: - max_checksums = self._max_estimation_size(checksums) + if hashes: + max_hashes = self._max_estimation_size(hashes) else: - max_checksums = None + max_hashes = None with Tqdm( desc="Estimating size of " @@ -607,33 +599,33 @@ def estimate_remote_size(self, checksums=None, name=None): def update(n=1): pbar.update(n * total_prefixes) - if max_checksums: - checksums = self._checksums_with_limit( - max_checksums / total_prefixes, prefix, update + if max_hashes: + hashes = self._hashes_with_limit( + max_hashes / total_prefixes, prefix, update ) else: - checksums = self.list_checksums(prefix, update) + hashes = self.list_hashes(prefix, update) - remote_checksums = set(checksums) - if remote_checksums: - remote_size = total_prefixes * len(remote_checksums) + remote_hashes = set(hashes) + if remote_hashes: + remote_size = total_prefixes * len(remote_hashes) else: remote_size = total_prefixes logger.debug(f"Estimated remote size: {remote_size} files") - return remote_size, remote_checksums + return remote_size, remote_hashes - def list_checksums_traverse( - self, remote_size, remote_checksums, jobs=None, name=None + def list_hashes_traverse( + self, remote_size, remote_hashes, jobs=None, name=None ): - """Iterate over all checksums found in this tree. 
- Checksums are fetched in parallel according to prefix, except in + """Iterate over all hashes found in this tree. + Hashes are fetched in parallel according to prefix, except in cases where the remote size is very small. - All checksums from the remote (including any from the size - estimation step passed via the `remote_checksums` argument) will be + All hashes from the remote (including any from the size + estimation step passed via the `remote_hashes` argument) will be returned. - NOTE: For large remotes the list of checksums will be very + NOTE: For large remotes the list of hashes will be very big(e.g. 100M entries, md5 for each is 32 bytes, so ~3200Mb list) and we don't really need all of it at the same time, so it makes sense to use a generator to gradually iterate over it, without @@ -645,13 +637,13 @@ def list_checksums_traverse( # requests, for small enough remotes it will be faster to fetch # entire cache without splitting it into prefixes. # - # NOTE: this ends up re-fetching checksums that were already + # NOTE: this ends up re-fetching hashes that were already # fetched during remote size estimation traverse_prefixes = [None] initial = 0 else: - yield from remote_checksums - initial = len(remote_checksums) + yield from remote_hashes + initial = len(remote_hashes) traverse_prefixes = [f"{i:02x}" for i in range(1, 256)] if self.TRAVERSE_PREFIX_LEN > 2: traverse_prefixes += [ @@ -668,7 +660,7 @@ def list_checksums_traverse( def list_with_update(prefix): return list( - self.list_checksums( + self.list_hashes( prefix=prefix, progress_callback=pbar.update ) ) @@ -677,17 +669,17 @@ def list_with_update(prefix): in_remote = executor.map(list_with_update, traverse_prefixes,) yield from itertools.chain.from_iterable(in_remote) - def list_checksums_exists(self, checksums, jobs=None, name=None): - """Return list of the specified checksums which exist in this tree. - Checksums will be queried individually. + def list_hashes_exists(self, hashes, jobs=None, name=None): + """Return list of the specified hashes which exist in this tree. + Hashes will be queried individually. 
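A condensed sketch of the ordering in list_hashes_traverse() above: hashes already collected during size estimation are yielded first, then the remaining two-character prefixes ("01" through "ff") are listed. The patch runs the per-prefix listing in a ThreadPoolExecutor; that is omitted here for brevity and the sample listing is invented.

def traverse(estimated_hashes, list_hashes_under):
    yield from estimated_hashes                          # the "00..." bucket
    for prefix in (f"{i:02x}" for i in range(1, 256)):   # "01" .. "ff"
        yield from list_hashes_under(prefix)

fake_remote = {"01": ["01aa"], "ff": ["ffee"]}
print(list(traverse(["0011"], lambda p: fake_remote.get(p, []))))
# -> ['0011', '01aa', 'ffee']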
""" logger.debug( - "Querying {} checksums via object_exists".format(len(checksums)) + "Querying {} hashes via object_exists".format(len(hashes)) ) with Tqdm( desc="Querying " + ("cache in " + name if name else "remote cache"), - total=len(checksums), + total=len(hashes), unit="file", ) as pbar: @@ -697,12 +689,12 @@ def exists_with_progress(path_info): return ret with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor: - path_infos = map(self.checksum_to_path_info, checksums) + path_infos = map(self.hash_to_path_info, hashes) in_remote = executor.map(exists_with_progress, path_infos) - ret = list(itertools.compress(checksums, in_remote)) + ret = list(itertools.compress(hashes, in_remote)) return ret - def _remove_unpacked_dir(self, checksum): + def _remove_unpacked_dir(self, hash_): pass @@ -747,17 +739,17 @@ def cache(self): def scheme(self): return self.tree.scheme - def is_dir_checksum(self, checksum): - return self.tree.is_dir_checksum(checksum) + def is_dir_hash(self, hash_): + return self.tree.is_dir_hash(hash_) - def get_checksum(self, path_info, **kwargs): - return self.tree.get_checksum(path_info, **kwargs) + def get_hash(self, path_info, **kwargs): + return self.tree.get_hash(path_info, **kwargs) - def checksum_to_path_info(self, checksum): - return self.tree.checksum_to_path_info(checksum) + def hash_to_path_info(self, hash_): + return self.tree.hash_to_path_info(hash_) - def path_to_checksum(self, path): - return self.tree.path_to_checksum(path) + def path_to_hash(self, path): + return self.tree.path_to_hash(path) def save_info(self, path_info, **kwargs): return self.tree.save_info(path_info, **kwargs) @@ -765,19 +757,19 @@ def save_info(self, path_info, **kwargs): def open(self, *args, **kwargs): return self.tree.open(*args, **kwargs) - def checksums_exist(self, checksums, jobs=None, name=None): - """Check if the given checksums are stored in the remote. + def hashes_exist(self, hashes, jobs=None, name=None): + """Check if the given hashes are stored in the remote. There are two ways of performing this check: - Traverse method: Get a list of all the files in the remote (traversing the cache directory) and compare it with - the given checksums. Cache entries will be retrieved in parallel + the given hashes. Cache entries will be retrieved in parallel threads according to prefix (i.e. entries starting with, "00...", "01...", and so on) and a progress bar will be displayed. - - Exists method: For each given checksum, run the `exists` - method and filter the checksums that aren't on the remote. + - Exists method: For each given hash, run the `exists` + method and filter the hashes that aren't on the remote. This is done in parallel threads. It also shows a progress bar when performing the check. @@ -785,80 +777,72 @@ def checksums_exist(self, checksums, jobs=None, name=None): take much shorter time to just retrieve everything they have under a certain prefix (e.g. s3, gs, ssh, hdfs). Other remotes that can check if particular file exists much quicker, use their own - implementation of checksums_exist (see ssh, local). + implementation of hashes_exist (see ssh, local). Which method to use will be automatically determined after estimating the size of the remote cache, and comparing the estimated size with - len(checksums). To estimate the size of the remote cache, we fetch + len(hashes). To estimate the size of the remote cache, we fetch a small subset of cache entries (i.e. entries starting with "00..."). 
Based on the number of entries in that subset, the size of the full cache can be estimated, since the cache is evenly distributed according - to checksum. + to hash. Returns: - A list with checksums that were found in the remote + A list with hashes that were found in the remote """ # Remotes which do not use traverse prefix should override - # checksums_exist() (see ssh, local) + # hashes_exist() (see ssh, local) assert self.tree.TRAVERSE_PREFIX_LEN >= 2 - checksums = set(checksums) - indexed_checksums = set(self.index.intersection(checksums)) - checksums -= indexed_checksums - logger.debug( - "Matched '{}' indexed checksums".format(len(indexed_checksums)) - ) - if not checksums: - return indexed_checksums + hashes = set(hashes) + indexed_hashes = set(self.index.intersection(hashes)) + hashes -= indexed_hashes + logger.debug("Matched '{}' indexed hashes".format(len(indexed_hashes))) + if not hashes: + return indexed_hashes - if len(checksums) == 1 or not self.tree.CAN_TRAVERSE: - remote_checksums = self.tree.list_checksums_exists( - checksums, jobs, name - ) - return list(indexed_checksums) + remote_checksums + if len(hashes) == 1 or not self.tree.CAN_TRAVERSE: + remote_hashes = self.tree.list_hashes_exists(hashes, jobs, name) + return list(indexed_hashes) + remote_hashes # Max remote size allowed for us to use traverse method - remote_size, remote_checksums = self.tree.estimate_remote_size( - checksums, name + remote_size, remote_hashes = self.tree.estimate_remote_size( + hashes, name ) traverse_pages = remote_size / self.tree.LIST_OBJECT_PAGE_SIZE # For sufficiently large remotes, traverse must be weighted to account # for performance overhead from large lists/sets. # From testing with S3, for remotes with 1M+ files, object_exists is - # faster until len(checksums) is at least 10k~100k + # faster until len(hashes) is at least 10k~100k if remote_size > self.tree.TRAVERSE_THRESHOLD_SIZE: traverse_weight = ( traverse_pages * self.tree.TRAVERSE_WEIGHT_MULTIPLIER ) else: traverse_weight = traverse_pages - if len(checksums) < traverse_weight: + if len(hashes) < traverse_weight: logger.debug( - "Large remote ('{}' checksums < '{}' traverse weight), " - "using object_exists for remaining checksums".format( - len(checksums), traverse_weight + "Large remote ('{}' hashes < '{}' traverse weight), " + "using object_exists for remaining hashes".format( + len(hashes), traverse_weight ) ) return ( - list(indexed_checksums) - + list(checksums & remote_checksums) - + self.tree.list_checksums_exists( - checksums - remote_checksums, jobs, name + list(indexed_hashes) + + list(hashes & remote_hashes) + + self.tree.list_hashes_exists( + hashes - remote_hashes, jobs, name ) ) - logger.debug( - "Querying '{}' checksums via traverse".format(len(checksums)) - ) - remote_checksums = set( - self.tree.list_checksums_traverse( - remote_size, remote_checksums, jobs, name + logger.debug("Querying '{}' hashes via traverse".format(len(hashes))) + remote_hashes = set( + self.tree.list_hashes_traverse( + remote_size, remote_hashes, jobs, name ) ) - return list(indexed_checksums) + list( - checksums & set(remote_checksums) - ) + return list(indexed_hashes) + list(hashes & set(remote_hashes)) @classmethod @index_locked @@ -870,18 +854,18 @@ def gc(cls, named_cache, remote, jobs=None): used.update(named_cache.scheme_keys(tree.scheme)) removed = False - # checksums must be sorted to ensure we always remove .dir files first - for checksum in sorted( + # hashes must be sorted to ensure we always remove .dir files first + for 
hash_ in sorted( tree.all(jobs, str(tree.path_info)), - key=tree.is_dir_checksum, + key=tree.is_dir_hash, reverse=True, ): - if checksum in used: + if hash_ in used: continue - path_info = tree.checksum_to_path_info(checksum) - if tree.is_dir_checksum(checksum): + path_info = tree.hash_to_path_info(hash_) + if tree.is_dir_hash(hash_): # backward compatibility - tree._remove_unpacked_dir(checksum) + tree._remove_unpacked_dir(hash_) tree.remove(path_info) removed = True @@ -925,43 +909,43 @@ def state(self): def open(self, *args, **kwargs): return self.tree.open(*args, **kwargs) - def is_dir_checksum(self, checksum): - return self.tree.is_dir_checksum(checksum) + def is_dir_hash(self, hash_): + return self.tree.is_dir_hash(hash_) - def get_checksum(self, path_info, **kwargs): - return self.tree.get_checksum(path_info, **kwargs) + def get_hash(self, path_info, **kwargs): + return self.tree.get_hash(path_info, **kwargs) # Override to return path as a string instead of PathInfo for clouds # which support string paths (see local) - def checksum_to_path(self, checksum): - return self.checksum_to_path_info(checksum) + def hash_to_path(self, hash_): + return self.hash_to_path_info(hash_) - def checksum_to_path_info(self, checksum): - return self.tree.checksum_to_path_info(checksum) + def hash_to_path_info(self, hash_): + return self.tree.hash_to_path_info(hash_) - def get_dir_cache(self, checksum): - assert checksum + def get_dir_cache(self, hash_): + assert hash_ - dir_info = self._dir_info.get(checksum) + dir_info = self._dir_info.get(hash_) if dir_info: return dir_info try: - dir_info = self.load_dir_cache(checksum) + dir_info = self.load_dir_cache(hash_) except DirCacheError: dir_info = [] - self._dir_info[checksum] = dir_info + self._dir_info[hash_] = dir_info return dir_info - def load_dir_cache(self, checksum): - path_info = self.checksum_to_path_info(checksum) + def load_dir_cache(self, hash_): + path_info = self.hash_to_path_info(hash_) try: with self.cache.open(path_info, "r") as fobj: d = json.load(fobj) except (ValueError, FileNotFoundError) as exc: - raise DirCacheError(checksum) from exc + raise DirCacheError(hash_) from exc if not isinstance(d, list): logger.error( @@ -981,7 +965,7 @@ def load_dir_cache(self, checksum): return d - def changed(self, path_info, checksum_info): + def changed(self, path_info, hash_info): """Checks if data has changed. A file is considered changed if: @@ -992,36 +976,34 @@ def changed(self, path_info, checksum_info): Args: path_info: dict with path information. - checksum: expected hash value for this data. + hash: expected hash value for this data. Returns: bool: True if data has changed, False otherwise. 
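The gc() hunk above sorts with key=is_dir_hash, reverse=True so that ".dir" entries are removed before the loose files they reference; a tiny sketch of that ordering with made-up hash values:

def is_dir_hash(hash_):
    return hash_.endswith(".dir")

hashes = ["aa11", "bb22.dir", "cc33"]
print(sorted(hashes, key=is_dir_hash, reverse=True))
# -> ['bb22.dir', 'aa11', 'cc33']  (.dir entries first, original order otherwise)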
""" logger.debug( - "checking if '%s'('%s') has changed.", path_info, checksum_info + "checking if '%s'('%s') has changed.", path_info, hash_info ) if not self.tree.exists(path_info): logger.debug("'%s' doesn't exist.", path_info) return True - checksum = checksum_info.get(self.tree.PARAM_CHECKSUM) - if checksum is None: + hash_ = hash_info.get(self.tree.PARAM_CHECKSUM) + if hash_ is None: logger.debug("hash value for '%s' is missing.", path_info) return True - if self.changed_cache(checksum): - logger.debug( - "cache for '%s'('%s') has changed.", path_info, checksum - ) + if self.changed_cache(hash_): + logger.debug("cache for '%s'('%s') has changed.", path_info, hash_) return True - actual = self.get_checksum(path_info) - if checksum != actual: + actual = self.get_hash(path_info) + if hash_ != actual: logger.debug( "hash value '%s' for '%s' has changed (actual '%s').", - checksum, + hash_, actual, path_info, ) @@ -1078,12 +1060,12 @@ def _do_link(self, from_info, to_info, link_method): "Created '%s': %s -> %s", self.cache_types[0], from_info, to_info, ) - def _save_file(self, path_info, tree, checksum, save_link=True, **kwargs): - assert checksum + def _save_file(self, path_info, tree, hash_, save_link=True, **kwargs): + assert hash_ - cache_info = self.checksum_to_path_info(checksum) + cache_info = self.hash_to_path_info(hash_) if tree == self.tree: - if self.changed_cache(checksum): + if self.changed_cache(hash_): self.tree.move(path_info, cache_info, mode=self.CACHE_MODE) self.link(cache_info, path_info) elif self.tree.iscopy(path_info) and self._cache_is_copy( @@ -1100,9 +1082,9 @@ def _save_file(self, path_info, tree, checksum, save_link=True, **kwargs): # we need to update path and cache, since in case of reflink, # or copy cache type moving original file results in updates on # next executed command, which causes md5 recalculation - self.state.save(path_info, checksum) + self.state.save(path_info, hash_) else: - if self.changed_cache(checksum): + if self.changed_cache(hash_): with tree.open(path_info, mode="rb") as fobj: # if tree has fetch enabled, DVC out will be fetched on # open and we do not need to read/copy any data @@ -1114,8 +1096,8 @@ def _save_file(self, path_info, tree, checksum, save_link=True, **kwargs): if callback: callback(1) - self.state.save(cache_info, checksum) - return {self.tree.PARAM_CHECKSUM: checksum} + self.state.save(cache_info, hash_) + return {self.tree.PARAM_CHECKSUM: hash_} def _cache_is_copy(self, path_info): """Checks whether cache uses copies.""" @@ -1139,80 +1121,75 @@ def _cache_is_copy(self, path_info): self.cache_type_confirmed = True return self.cache_types[0] == "copy" - def _save_dir(self, path_info, tree, checksum, save_link=True, **kwargs): - dir_info = self.get_dir_cache(checksum) + def _save_dir(self, path_info, tree, hash_, save_link=True, **kwargs): + dir_info = self.get_dir_cache(hash_) for entry in Tqdm( dir_info, desc="Saving " + path_info.name, unit="file" ): entry_info = path_info / entry[self.tree.PARAM_RELPATH] - entry_checksum = entry[self.tree.PARAM_CHECKSUM] + entry_hash = entry[self.tree.PARAM_CHECKSUM] self._save_file( - entry_info, tree, entry_checksum, save_link=False, **kwargs + entry_info, tree, entry_hash, save_link=False, **kwargs ) if save_link: self.state.save_link(path_info) if self.tree.exists(path_info): - self.state.save(path_info, checksum) + self.state.save(path_info, hash_) - cache_info = self.checksum_to_path_info(checksum) - self.state.save(cache_info, checksum) - return {self.tree.PARAM_CHECKSUM: 
checksum} + cache_info = self.hash_to_path_info(hash_) + self.state.save(cache_info, hash_) + return {self.tree.PARAM_CHECKSUM: hash_} - def save(self, path_info, tree, checksum_info, save_link=True, **kwargs): + def save(self, path_info, tree, hash_info, save_link=True, **kwargs): if path_info.scheme != self.scheme: raise RemoteActionNotImplemented( f"save {path_info.scheme} -> {self.scheme}", self.scheme, ) - if not checksum_info: - checksum_info = self.tree.save_info(path_info, tree=tree, **kwargs) - checksum = checksum_info[self.tree.PARAM_CHECKSUM] - return self._save(path_info, tree, checksum, save_link, **kwargs) + if not hash_info: + hash_info = self.tree.save_info(path_info, tree=tree, **kwargs) + hash_ = hash_info[self.tree.PARAM_CHECKSUM] + return self._save(path_info, tree, hash_, save_link, **kwargs) - def _save(self, path_info, tree, checksum, save_link=True, **kwargs): - to_info = self.checksum_to_path_info(checksum) + def _save(self, path_info, tree, hash_, save_link=True, **kwargs): + to_info = self.hash_to_path_info(hash_) logger.debug("Saving '%s' to '%s'.", path_info, to_info) if tree.isdir(path_info): - return self._save_dir( - path_info, tree, checksum, save_link, **kwargs - ) - return self._save_file(path_info, tree, checksum, save_link, **kwargs) + return self._save_dir(path_info, tree, hash_, save_link, **kwargs) + return self._save_file(path_info, tree, hash_, save_link, **kwargs) - def changed_cache_file(self, checksum): - """Compare the given checksum with the (corresponding) actual one. + def changed_cache_file(self, hash_): + """Compare the given hash with the (corresponding) actual one. - - Use `State` as a cache for computed checksums + - Use `State` as a cache for computed hashes + The entries are invalidated by taking into account the following: * mtime * inode * size - * checksum + * hash - - Remove the file from cache if it doesn't match the actual checksum + - Remove the file from cache if it doesn't match the actual hash """ # Prefer string path over PathInfo when possible due to performance - cache_info = self.checksum_to_path(checksum) + cache_info = self.hash_to_path(hash_) if self.tree.is_protected(cache_info): logger.debug( "Assuming '%s' is unchanged since it is read-only", cache_info ) return False - actual = self.get_checksum(cache_info) + actual = self.get_hash(cache_info) logger.debug( - "cache '%s' expected '%s' actual '%s'", - cache_info, - checksum, - actual, + "cache '%s' expected '%s' actual '%s'", cache_info, hash_, actual, ) - if not checksum or not actual: + if not hash_ or not actual: return True - if actual.split(".")[0] == checksum.split(".")[0]: + if actual.split(".")[0] == hash_.split(".")[0]: # making cache file read-only so we don't need to check it # next time self.tree.protect(cache_info) @@ -1224,32 +1201,32 @@ def changed_cache_file(self, checksum): return True - def _changed_dir_cache(self, checksum, path_info=None, filter_info=None): - if self.changed_cache_file(checksum): + def _changed_dir_cache(self, hash_, path_info=None, filter_info=None): + if self.changed_cache_file(hash_): return True - for entry in self.get_dir_cache(checksum): - entry_checksum = entry[self.tree.PARAM_CHECKSUM] + for entry in self.get_dir_cache(hash_): + entry_hash = entry[self.tree.PARAM_CHECKSUM] if path_info and filter_info: entry_info = path_info / entry[self.tree.PARAM_RELPATH] if not entry_info.isin_or_eq(filter_info): continue - if self.changed_cache_file(entry_checksum): + if self.changed_cache_file(entry_hash): return True return False - 
def changed_cache(self, checksum, path_info=None, filter_info=None): - if self.is_dir_checksum(checksum): + def changed_cache(self, hash_, path_info=None, filter_info=None): + if self.is_dir_hash(hash_): return self._changed_dir_cache( - checksum, path_info=path_info, filter_info=filter_info + hash_, path_info=path_info, filter_info=filter_info ) - return self.changed_cache_file(checksum) + return self.changed_cache_file(hash_) def already_cached(self, path_info): - current = self.get_checksum(path_info) + current = self.get_hash(path_info) if not current: return False @@ -1272,11 +1249,11 @@ def safe_remove(self, path_info, force=False): self.tree.remove(path_info) def _checkout_file( - self, path_info, checksum, force, progress_callback=None, relink=False + self, path_info, hash_, force, progress_callback=None, relink=False ): """The file is changed we need to checkout a new copy""" added, modified = True, False - cache_info = self.checksum_to_path_info(checksum) + cache_info = self.hash_to_path_info(hash_) if self.tree.exists(path_info): logger.debug("data '%s' will be replaced.", path_info) self.safe_remove(path_info, force=force) @@ -1284,7 +1261,7 @@ def _checkout_file( self.link(cache_info, path_info) self.state.save_link(path_info) - self.state.save(path_info, checksum) + self.state.save(path_info, hash_) if progress_callback: progress_callback(str(path_info)) @@ -1293,7 +1270,7 @@ def _checkout_file( def _checkout_dir( self, path_info, - checksum, + hash_, force, progress_callback=None, relink=False, @@ -1306,25 +1283,25 @@ def _checkout_dir( added = True self.tree.makedirs(path_info) - dir_info = self.get_dir_cache(checksum) + dir_info = self.get_dir_cache(hash_) logger.debug("Linking directory '%s'.", path_info) for entry in dir_info: relative_path = entry[self.tree.PARAM_RELPATH] - entry_checksum = entry[self.tree.PARAM_CHECKSUM] - entry_cache_info = self.checksum_to_path_info(entry_checksum) + entry_hash = entry[self.tree.PARAM_CHECKSUM] + entry_cache_info = self.hash_to_path_info(entry_hash) entry_info = path_info / relative_path if filter_info and not entry_info.isin_or_eq(filter_info): continue - entry_checksum_info = {self.tree.PARAM_CHECKSUM: entry_checksum} - if relink or self.changed(entry_info, entry_checksum_info): + entry_hash_info = {self.tree.PARAM_CHECKSUM: entry_hash} + if relink or self.changed(entry_info, entry_hash_info): modified = True self.safe_remove(entry_info, force=force) self.link(entry_cache_info, entry_info) - self.state.save(entry_info, entry_checksum) + self.state.save(entry_info, entry_hash) if progress_callback: progress_callback(str(entry_info)) @@ -1334,7 +1311,7 @@ def _checkout_dir( ) self.state.save_link(path_info) - self.state.save(path_info, checksum) + self.state.save(path_info, hash_) # relink is not modified, assume it as nochange return added, not added and modified and not relink @@ -1354,7 +1331,7 @@ def _remove_redundant_files(self, path_info, dir_info, force): def checkout( self, path_info, - checksum_info, + hash_info, force=False, progress_callback=None, relink=False, @@ -1363,10 +1340,10 @@ def checkout( if path_info.scheme not in ["local", self.scheme]: raise NotImplementedError - checksum = checksum_info.get(self.tree.PARAM_CHECKSUM) + hash_ = hash_info.get(self.tree.PARAM_CHECKSUM) failed = None skip = False - if not checksum: + if not hash_: logger.warning( "No file hash info found for '%s'. 
" "It won't be created.", path_info, @@ -1374,16 +1351,16 @@ def checkout( self.safe_remove(path_info, force=force) failed = path_info - elif not relink and not self.changed(path_info, checksum_info): + elif not relink and not self.changed(path_info, hash_info): logger.debug("Data '%s' didn't change.", path_info) skip = True elif self.changed_cache( - checksum, path_info=path_info, filter_info=filter_info + hash_, path_info=path_info, filter_info=filter_info ): logger.warning( "Cache '%s' not found. File '%s' won't be created.", - checksum, + hash_, path_info, ) self.safe_remove(path_info, force=force) @@ -1393,51 +1370,49 @@ def checkout( if progress_callback: progress_callback( str(path_info), - self.get_files_number( - self.path_info, checksum, filter_info - ), + self.get_files_number(self.path_info, hash_, filter_info), ) if failed: raise CheckoutError([failed]) return - logger.debug("Checking out '%s' with cache '%s'.", path_info, checksum) + logger.debug("Checking out '%s' with cache '%s'.", path_info, hash_) return self._checkout( - path_info, checksum, force, progress_callback, relink, filter_info, + path_info, hash_, force, progress_callback, relink, filter_info, ) def _checkout( self, path_info, - checksum, + hash_, force=False, progress_callback=None, relink=False, filter_info=None, ): - if not self.is_dir_checksum(checksum): + if not self.is_dir_hash(hash_): return self._checkout_file( - path_info, checksum, force, progress_callback, relink + path_info, hash_, force, progress_callback, relink ) return self._checkout_dir( - path_info, checksum, force, progress_callback, relink, filter_info + path_info, hash_, force, progress_callback, relink, filter_info ) - def get_files_number(self, path_info, checksum, filter_info): + def get_files_number(self, path_info, hash_, filter_info): from funcy.py3 import ilen - if not checksum: + if not hash_: return 0 - if not self.is_dir_checksum(checksum): + if not self.is_dir_hash(hash_): return 1 if not filter_info: - return len(self.get_dir_cache(checksum)) + return len(self.get_dir_cache(hash_)) return ilen( filter_info.isin_or_eq(path_info / entry[self.tree.PARAM_CHECKSUM]) - for entry in self.get_dir_cache(checksum) + for entry in self.get_dir_cache(hash_) ) diff --git a/dvc/remote/gdrive.py b/dvc/remote/gdrive.py index fa7194d4e6..874622f404 100644 --- a/dvc/remote/gdrive.py +++ b/dvc/remote/gdrive.py @@ -564,7 +564,7 @@ def remove(self, path_info): item_id = self._get_item_id(path_info) self.gdrive_delete_file(item_id) - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): raise NotImplementedError def _upload(self, from_file, to_info, name=None, no_progress_bar=False): diff --git a/dvc/remote/gs.py b/dvc/remote/gs.py index 5796403102..0138de6270 100644 --- a/dvc/remote/gs.py +++ b/dvc/remote/gs.py @@ -160,7 +160,7 @@ def copy(self, from_info, to_info): to_bucket = self.gs.bucket(to_info.bucket) from_bucket.copy_blob(blob, to_bucket, new_name=to_info.path) - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): import base64 import codecs diff --git a/dvc/remote/hdfs.py b/dvc/remote/hdfs.py index fa3f62e460..e9cfe42e8c 100644 --- a/dvc/remote/hdfs.py +++ b/dvc/remote/hdfs.py @@ -159,7 +159,7 @@ def _group(regex, s, gname): assert match is not None return match.group(gname) - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): # NOTE: pyarrow doesn't support checksum, so we need to use hadoop regex = r".*\t.*\t(?P.*)" stdout = self.hadoop_fs( diff --git 
a/dvc/remote/http.py b/dvc/remote/http.py index b28e7670df..7affaa780f 100644 --- a/dvc/remote/http.py +++ b/dvc/remote/http.py @@ -124,7 +124,7 @@ def request(self, method, url, **kwargs): def exists(self, path_info): return bool(self.request("HEAD", path_info.url)) - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): url = path_info.url headers = self.request("HEAD", url).headers etag = headers.get("ETag") or headers.get("Content-MD5") diff --git a/dvc/remote/index.py b/dvc/remote/index.py index 2bd191e98e..aefbe9d8fd 100644 --- a/dvc/remote/index.py +++ b/dvc/remote/index.py @@ -25,15 +25,15 @@ def __exit__(self, typ, value, tbck): def __iter__(self): return iter([]) - def __contains__(self, checksum): + def __contains__(self, hash_): return False @staticmethod - def checksums(): + def hashes(): return [] @staticmethod - def dir_checksums(): + def dir_hashes(): return [] def load(self): @@ -54,13 +54,13 @@ def intersection(*args): class RemoteIndex: - """Class for indexing remote checksums in a sqlite3 database. + """Class for indexing remote hashes in a sqlite3 database. Args: repo: repo for this remote index. name: name for this index. Index db will be loaded from and saved to ``.dvc/tmp/index/{name}.idx``. - dir_suffix: suffix used for naming directory checksums + dir_suffix: suffix used for naming directory hashes """ INDEX_SUFFIX = ".idx" @@ -79,8 +79,8 @@ def __init__(self, repo, name, dir_suffix=".dir"): def __iter__(self): cmd = f"SELECT checksum FROM {self.INDEX_TABLE}" - for (checksum,) in self._execute(cmd): - yield checksum + for (hash_,) in self._execute(cmd): + yield hash_ def __enter__(self): self.lock.acquire() @@ -90,25 +90,25 @@ def __exit__(self, typ, value, tbck): self.dump() self.lock.release() - def __contains__(self, checksum): + def __contains__(self, hash_): cmd = "SELECT checksum FROM {} WHERE checksum = (?)".format( self.INDEX_TABLE ) - self._execute(cmd, (checksum,)) + self._execute(cmd, (hash_,)) return self.cursor.fetchone() is not None - def checksums(self): - """Iterate over checksums stored in the index.""" + def hashes(self): + """Iterate over hashes stored in the index.""" return iter(self) - def dir_checksums(self): - """Iterate over .dir checksums stored in the index.""" + def dir_hashes(self): + """Iterate over .dir hashes stored in the index.""" cmd = f"SELECT checksum FROM {self.INDEX_TABLE} WHERE dir = 1" - for (checksum,) in self._execute(cmd): - yield checksum + for (hash_,) in self._execute(cmd): + yield hash_ - def is_dir_checksum(self, checksum): - return checksum.endswith(self.dir_suffix) + def is_dir_hash(self, hash_): + return hash_.endswith(self.dir_suffix) def _execute(self, cmd, parameters=()): return self.cursor.execute(cmd, parameters) @@ -185,28 +185,24 @@ def clear(self): cmd = f"DELETE FROM {self.INDEX_TABLE}" self._execute(cmd) - def update(self, dir_checksums, file_checksums): - """Update this index, adding the specified checksums. + def update(self, dir_hashes, file_hashes): + """Update this index, adding the specified hashes. Changes to the index will not committed until dump() is called. 
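A minimal in-memory sketch of the index operations renamed in this hunk: a single table keyed by hash with a "dir" flag, filled with INSERT OR IGNORE and queried with IN (...) for intersection. The column names follow the diff; the table name and schema details are stand-ins.

import sqlite3

db = sqlite3.connect(":memory:")
db.execute(
    "CREATE TABLE remote_index "
    "(checksum TEXT PRIMARY KEY NOT NULL, dir INTEGER NOT NULL)"
)

def update(dir_hashes, file_hashes):
    cmd = "INSERT OR IGNORE INTO remote_index (checksum, dir) VALUES (?, ?)"
    db.executemany(cmd, ((hash_, True) for hash_ in dir_hashes))
    db.executemany(cmd, ((hash_, False) for hash_ in file_hashes))

def intersection(hashes):
    cmd = "SELECT checksum FROM remote_index WHERE checksum IN ({})".format(
        ",".join("?" for _ in hashes)
    )
    return [hash_ for (hash_,) in db.execute(cmd, list(hashes))]

update(["bb22.dir"], ["aa11", "cc33"])
print(intersection(["aa11", "zz99"]))  # -> ['aa11']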
""" cmd = "INSERT OR IGNORE INTO {} (checksum, dir) VALUES (?, ?)".format( self.INDEX_TABLE ) - self._executemany( - cmd, ((checksum, True) for checksum in dir_checksums) - ) - self._executemany( - cmd, ((checksum, False) for checksum in file_checksums) - ) + self._executemany(cmd, ((hash_, True) for hash_ in dir_hashes)) + self._executemany(cmd, ((hash_, False) for hash_ in file_hashes)) - def intersection(self, checksums): - """Iterate over values from `checksums` which exist in the index.""" + def intersection(self, hashes): + """Iterate over values from `hashes` which exist in the index.""" # sqlite has a compile time limit of 999, see: # https://www.sqlite.org/c3ref/c_limit_attached.html#sqlitelimitvariablenumber - for chunk in lchunks(999, checksums): + for chunk in lchunks(999, hashes): cmd = "SELECT checksum FROM {} WHERE checksum IN ({})".format( - self.INDEX_TABLE, ",".join("?" for checksum in chunk) + self.INDEX_TABLE, ",".join("?" for hash_ in chunk) ) - for (checksum,) in self._execute(cmd, chunk): - yield checksum + for (hash_,) in self._execute(cmd, chunk): + yield hash_ diff --git a/dvc/remote/local.py b/dvc/remote/local.py index 38889d35fe..7b0c4c445e 100644 --- a/dvc/remote/local.py +++ b/dvc/remote/local.py @@ -169,7 +169,7 @@ def hardlink(self, from_info, to_info): # and the cache type is `hardlink`, we might reach link limits and # will get something like: `too many links error` # - # This is because all those empty files will have the same checksum + # This is because all those empty files will have the same hash # (i.e. 68b329da9893e34099c7d8ad5cb9c940), therefore, they will be # linked to the same file in the cache. # @@ -268,7 +268,7 @@ def is_protected(self, path_info): return stat.S_IMODE(mode) == self.CACHE_MODE - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): return file_md5(path_info)[0] @staticmethod @@ -313,8 +313,8 @@ def list_paths(self, prefix=None, progress_callback=None): else: yield from walk_files(path_info) - def _remove_unpacked_dir(self, checksum): - info = self.checksum_to_path_info(checksum) + def _remove_unpacked_dir(self, hash_): + info = self.hash_to_path_info(hash_) path_info = info.with_name(info.name + self.UNPACKED_DIR_SUFFIX) self.remove(path_info) @@ -367,30 +367,28 @@ def supported(cls, config): def cache_path(self): return os.path.abspath(self.cache_dir) - def checksum_to_path(self, checksum): + def hash_to_path(self, hash_): # NOTE: `self.cache_path` is already normalized so we can simply use # `os.sep` instead of `os.path.join`. This results in this helper # being ~5.5 times faster. 
- return ( - f"{self.cache_path}{os.sep}{checksum[0:2]}{os.sep}{checksum[2:]}" - ) + return f"{self.cache_path}{os.sep}{hash_[0:2]}{os.sep}{hash_[2:]}" - def checksums_exist(self, checksums, jobs=None, name=None): + def hashes_exist(self, hashes, jobs=None, name=None): return [ - checksum - for checksum in Tqdm( - checksums, + hash_ + for hash_ in Tqdm( + hashes, unit="file", desc="Querying " + ("cache in " + name if name else "local cache"), ) - if not self.changed_cache_file(checksum) + if not self.changed_cache_file(hash_) ] def already_cached(self, path_info): assert path_info.scheme in ["", "local"] - current_md5 = self.get_checksum(path_info) + current_md5 = self.get_hash(path_info) if not current_md5: return False @@ -434,7 +432,7 @@ def _status( dir_status_info contains status for .dir files, file_status_info contains status for all other files, and dir_contents is a dict of - {dir_checksum: set(file_checksum, ...)} which can be used to map + {dir_hash: set(file_hash, ...)} which can be used to map a .dir file to its file contents. """ logger.debug(f"Preparing to collect status from {remote.path_info}") @@ -442,7 +440,7 @@ def _status( logger.debug("Collecting information from local cache...") local_exists = frozenset( - self.checksums_exist(md5s, jobs=jobs, name=self.cache_dir) + self.hashes_exist(md5s, jobs=jobs, name=self.cache_dir) ) # This is a performance optimization. We can safely assume that, @@ -457,12 +455,12 @@ def _status( dir_md5s = set(named_cache.dir_keys(self.scheme)) if dir_md5s: remote_exists.update( - self._indexed_dir_checksums(named_cache, remote, dir_md5s) + self._indexed_dir_hashes(named_cache, remote, dir_md5s) ) md5s.difference_update(remote_exists) if md5s: remote_exists.update( - remote.checksums_exist( + remote.hashes_exist( md5s, jobs=jobs, name=str(remote.path_info) ) ) @@ -473,23 +471,23 @@ def _status( def _make_status( self, named_cache, show_checksums, local_exists, remote_exists ): - def make_names(checksum, names): - return {"name": checksum if show_checksums else " ".join(names)} + def make_names(hash_, names): + return {"name": hash_ if show_checksums else " ".join(names)} dir_status = {} file_status = {} dir_contents = {} - for checksum, item in named_cache[self.scheme].items(): + for hash_, item in named_cache[self.scheme].items(): if item.children: - dir_status[checksum] = make_names(checksum, item.names) - dir_contents[checksum] = set() - for child_checksum, child in item.children.items(): - file_status[child_checksum] = make_names( - child_checksum, child.names + dir_status[hash_] = make_names(hash_, item.names) + dir_contents[hash_] = set() + for child_hash, child in item.children.items(): + file_status[child_hash] = make_names( + child_hash, child.names ) - dir_contents[checksum].add(child_checksum) + dir_contents[hash_].add(child_hash) else: - file_status[checksum] = make_names(checksum, item.names) + file_status[hash_] = make_names(hash_, item.names) self._fill_statuses(dir_status, local_exists, remote_exists) self._fill_statuses(file_status, local_exists, remote_exists) @@ -498,52 +496,50 @@ def make_names(checksum, names): return dir_status, file_status, dir_contents - def _indexed_dir_checksums(self, named_cache, remote, dir_md5s): - # Validate our index by verifying all indexed .dir checksums + def _indexed_dir_hashes(self, named_cache, remote, dir_md5s): + # Validate our index by verifying all indexed .dir hashes # still exist on the remote - indexed_dirs = set(remote.index.dir_checksums()) + indexed_dirs = 
set(remote.index.dir_hashes()) indexed_dir_exists = set() if indexed_dirs: indexed_dir_exists.update( - remote.tree.list_checksums_exists(indexed_dirs) + remote.tree.list_hashes_exists(indexed_dirs) ) missing_dirs = indexed_dirs.difference(indexed_dir_exists) if missing_dirs: logger.debug( - "Remote cache missing indexed .dir checksums '{}', " + "Remote cache missing indexed .dir hashes '{}', " "clearing remote index".format(", ".join(missing_dirs)) ) remote.index.clear() - # Check if non-indexed (new) dir checksums exist on remote + # Check if non-indexed (new) dir hashes exist on remote dir_exists = dir_md5s.intersection(indexed_dir_exists) dir_exists.update( - remote.tree.list_checksums_exists(dir_md5s - dir_exists) + remote.tree.list_hashes_exists(dir_md5s - dir_exists) ) - # If .dir checksum exists on the remote, assume directory contents + # If .dir hash exists on the remote, assume directory contents # still exists on the remote - for dir_checksum in dir_exists: - file_checksums = list( - named_cache.child_keys(self.scheme, dir_checksum) - ) - if dir_checksum not in remote.index: + for dir_hash in dir_exists: + file_hashes = list(named_cache.child_keys(self.scheme, dir_hash)) + if dir_hash not in remote.index: logger.debug( "Indexing new .dir '{}' with '{}' nested files".format( - dir_checksum, len(file_checksums) + dir_hash, len(file_hashes) ) ) - remote.index.update([dir_checksum], file_checksums) - yield dir_checksum - yield from file_checksums + remote.index.update([dir_hash], file_hashes) + yield dir_hash + yield from file_hashes @staticmethod - def _fill_statuses(checksum_info_dir, local_exists, remote_exists): + def _fill_statuses(hash_info_dir, local_exists, remote_exists): # Using sets because they are way faster for lookups local = set(local_exists) remote = set(remote_exists) - for md5, info in checksum_info_dir.items(): + for md5, info in hash_info_dir.items(): status = STATUS_MAP[(md5 in local, md5 in remote)] info["status"] = status @@ -551,15 +547,15 @@ def _get_plans(self, download, remote, status_info, status): cache = [] path_infos = [] names = [] - checksums = [] + hashes = [] for md5, info in Tqdm( status_info.items(), desc="Analysing status", unit="file" ): if info["status"] == status: - cache.append(self.checksum_to_path_info(md5)) - path_infos.append(remote.checksum_to_path_info(md5)) + cache.append(self.hash_to_path_info(md5)) + path_infos.append(remote.hash_to_path_info(md5)) names.append(info["name"]) - checksums.append(md5) + hashes.append(md5) if download: to_infos = cache @@ -568,7 +564,7 @@ def _get_plans(self, download, remote, status_info, status): to_infos = path_infos from_infos = cache - return from_infos, to_infos, names, checksums + return from_infos, to_infos, names, hashes def _process( self, @@ -630,20 +626,18 @@ def _process( # for uploads, push files first, and any .dir files last file_futures = {} - for from_info, to_info, name, checksum in zip(*file_plans): - file_futures[checksum] = executor.submit( + for from_info, to_info, name, hash_ in zip(*file_plans): + file_futures[hash_] = executor.submit( func, from_info, to_info, name ) dir_futures = {} - for from_info, to_info, name, dir_checksum in zip( - *dir_plans - ): + for from_info, to_info, name, dir_hash in zip(*dir_plans): wait_futures = { future - for file_checksum, future in file_futures.items() - if file_checksum in dir_contents[dir_checksum] + for file_hash, future in file_futures.items() + if file_hash in dir_contents[dir_hash] } - dir_futures[dir_checksum] = executor.submit( + 
dir_futures[dir_hash] = executor.submit( self._dir_upload, func, wait_futures, @@ -666,16 +660,14 @@ def _process( if not download: # index successfully pushed dirs - for dir_checksum, future in dir_futures.items(): + for dir_hash, future in dir_futures.items(): if future.result() == 0: - file_checksums = dir_contents[dir_checksum] + file_hashes = dir_contents[dir_hash] logger.debug( "Indexing pushed dir '{}' with " - "'{}' nested files".format( - dir_checksum, len(file_checksums) - ) + "'{}' nested files".format(dir_hash, len(file_hashes)) ) - remote.index.update([dir_checksum], file_checksums) + remote.index.update([dir_hash], file_hashes) return len(dir_plans[0]) + len(file_plans[0]) @@ -714,10 +706,10 @@ def pull(self, named_cache, remote, jobs=None, show_checksums=False): ) @staticmethod - def _log_missing_caches(checksum_info_dict): + def _log_missing_caches(hash_info_dict): missing_caches = [ (md5, info) - for md5, info in checksum_info_dict.items() + for md5, info in hash_info_dict.items() if info["status"] == STATUS_MISSING ] if missing_caches: diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 19670a9787..9a1911da95 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -308,7 +308,7 @@ def _copy(cls, s3, from_info, to_info, extra_args): if etag != cached_etag: raise ETagMismatchError(etag, cached_etag) - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): return self.get_etag(self.s3, path_info.bucket, path_info.path) def _upload(self, from_file, to_info, name=None, no_progress_bar=False): diff --git a/dvc/remote/ssh/__init__.py b/dvc/remote/ssh/__init__.py index db355c6fd1..456e85f990 100644 --- a/dvc/remote/ssh/__init__.py +++ b/dvc/remote/ssh/__init__.py @@ -237,7 +237,7 @@ def reflink(self, from_info, to_info): with self.ssh(from_info) as ssh: ssh.reflink(from_info.path, to_info.path) - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): if path_info.scheme != self.scheme: raise NotImplementedError @@ -314,14 +314,14 @@ def _exists(chunk_and_channel): return results - def checksums_exist(self, checksums, jobs=None, name=None): + def hashes_exist(self, hashes, jobs=None, name=None): """This is older implementation used in remote/base.py We are reusing it in RemoteSSH, because SSH's batch_exists proved to be faster than current approach (relying on exists(path_info)) applied in remote/base. 
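A compact sketch of the push ordering enforced by the file_futures/dir_futures logic above: each ".dir" entry is uploaded only after the futures for its nested files have completed. concurrent.futures.wait() stands in for the internal _dir_upload helper, and the hash values are invented.

from concurrent.futures import ThreadPoolExecutor, wait

dir_contents = {"bb22.dir": {"aa11", "cc33"}}  # .dir hash -> nested file hashes

def upload(hash_):
    return f"uploaded {hash_}"

with ThreadPoolExecutor(max_workers=4) as executor:
    file_futures = {h: executor.submit(upload, h) for h in ["aa11", "cc33"]}
    for dir_hash, nested in dir_contents.items():
        # block until every nested file of this .dir entry has been pushed
        wait([fut for h, fut in file_futures.items() if h in nested])
        print(upload(dir_hash))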
""" if not self.tree.CAN_TRAVERSE: - return list(set(checksums) & set(self.tree.all())) + return list(set(hashes) & set(self.tree.all())) # possibly prompt for credentials before "Querying" progress output self.tree.ensure_credentials() @@ -329,7 +329,7 @@ def checksums_exist(self, checksums, jobs=None, name=None): with Tqdm( desc="Querying " + ("cache in " + name if name else "remote cache"), - total=len(checksums), + total=len(hashes), unit="file", ) as pbar: @@ -339,9 +339,9 @@ def exists_with_progress(chunks): with ThreadPoolExecutor( max_workers=jobs or self.tree.JOBS ) as executor: - path_infos = [self.checksum_to_path_info(x) for x in checksums] + path_infos = [self.hash_to_path_info(x) for x in hashes] chunks = to_chunks(path_infos, num_chunks=self.tree.JOBS) results = executor.map(exists_with_progress, chunks) in_remote = itertools.chain.from_iterable(results) - ret = list(itertools.compress(checksums, in_remote)) + ret = list(itertools.compress(hashes, in_remote)) return ret diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 77d03978f7..ce8e6e8aee 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -37,7 +37,7 @@ def _to_path(output): def _to_checksum(output): if on_working_tree: - return self.cache.local.get_checksum(output.path_info) + return self.cache.local.get_hash(output.path_info) return output.checksum def _exists(output): diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 1c83155b1a..728c1529b5 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -82,7 +82,7 @@ def open(self, path, mode="r", encoding="utf-8", remote=None): else: checksum = out.checksum try: - remote_info = remote_obj.checksum_to_path_info(checksum) + remote_info = remote_obj.hash_to_path_info(checksum) return remote_obj.open( remote_info, mode=mode, encoding=encoding ) @@ -93,7 +93,7 @@ def open(self, path, mode="r", encoding="utf-8", remote=None): if out.is_dir_checksum: checksum = self._get_granular_checksum(path, out) - cache_path = out.cache.checksum_to_path_info(checksum).url + cache_path = out.cache.hash_to_path_info(checksum).url else: cache_path = out.cache_path return open(cache_path, mode=mode, encoding=encoding) @@ -212,7 +212,7 @@ def isdvc(self, path, **kwargs): def isexec(self, path): return False - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): outs = self._find_outs(path_info, strict=False) if len(outs) != 1: raise OutputNotFoundError @@ -373,7 +373,7 @@ def walk_files(self, top, **kwargs): for fname in files: yield PathInfo(root) / fname - def get_file_checksum(self, path_info): + def get_file_hash(self, path_info): """Return file checksum for specified path. If path_info is a DVC out, the pre-computed checksum for the file @@ -384,7 +384,7 @@ def get_file_checksum(self, path_info): raise FileNotFoundError if self.dvctree and self.dvctree.exists(path_info): try: - return self.dvctree.get_file_checksum(path_info) + return self.dvctree.get_file_hash(path_info) except OutputNotFoundError: pass return file_md5(path_info, self)[0] diff --git a/tests/func/remote/test_index.py b/tests/func/remote/test_index.py index 9df9cf7e9c..0fdd510788 100644 --- a/tests/func/remote/test_index.py +++ b/tests/func/remote/test_index.py @@ -15,12 +15,12 @@ def remote(tmp_dir, dvc, tmp_path_factory, mocker): dvc.config["remote"]["upstream"] = {"url": url} dvc.config["core"]["remote"] = "upstream" - # patch checksums_exist since the LocalRemote normally overrides - # BaseRemoteTree.checksums_exist. 
- def checksums_exist(self, *args, **kwargs): - return Remote.checksums_exist(self, *args, **kwargs) + # patch hashes_exist since the LocalRemote normally overrides + # BaseRemoteTree.hashes_exist. + def hashes_exist(self, *args, **kwargs): + return Remote.hashes_exist(self, *args, **kwargs) - mocker.patch.object(LocalRemote, "checksums_exist", checksums_exist) + mocker.patch.object(LocalRemote, "hashes_exist", hashes_exist) # patch index class since LocalRemote normally overrides index class mocker.patch.object(LocalRemote, "INDEX_CLS", RemoteIndex) @@ -38,9 +38,9 @@ def test_indexed_on_status(tmp_dir, dvc, tmp_path_factory, remote): dvc.status(cloud=True) with remote.index: - assert {bar.checksum, baz["md5"]} == set(remote.index.checksums()) - assert [bar.checksum] == list(remote.index.dir_checksums()) - assert foo.checksum not in remote.index.checksums() + assert {bar.checksum, baz["md5"]} == set(remote.index.hashes()) + assert [bar.checksum] == list(remote.index.dir_hashes()) + assert foo.checksum not in remote.index.hashes() def test_indexed_on_push(tmp_dir, dvc, tmp_path_factory, remote): @@ -50,9 +50,9 @@ def test_indexed_on_push(tmp_dir, dvc, tmp_path_factory, remote): dvc.push() with remote.index: - assert {bar.checksum, baz["md5"]} == set(remote.index.checksums()) - assert [bar.checksum] == list(remote.index.dir_checksums()) - assert foo.checksum not in remote.index.checksums() + assert {bar.checksum, baz["md5"]} == set(remote.index.hashes()) + assert [bar.checksum] == list(remote.index.dir_hashes()) + assert foo.checksum not in remote.index.hashes() def test_indexed_dir_missing(tmp_dir, dvc, tmp_path_factory, remote): @@ -61,7 +61,7 @@ def test_indexed_dir_missing(tmp_dir, dvc, tmp_path_factory, remote): remote.index.update([bar.checksum], []) dvc.status(cloud=True) with remote.index: - assert not list(remote.index.checksums()) + assert not list(remote.index.hashes()) def test_clear_on_gc(tmp_dir, dvc, tmp_path_factory, remote, mocker): @@ -101,4 +101,4 @@ def unreliable_upload(self, from_file, to_info, name=None, **kwargs): with pytest.raises(UploadError): dvc.push() with remote.index: - assert not list(remote.index.checksums()) + assert not list(remote.index.hashes()) diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 5a9965d9dd..d1f081733c 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -340,7 +340,7 @@ def test(self): def test_should_collect_dir_cache_only_once(mocker, tmp_dir, dvc): tmp_dir.gen({"data/data": "foo"}) - get_dir_checksum_counter = mocker.spy(LocalRemoteTree, "get_dir_checksum") + get_dir_hash_counter = mocker.spy(LocalRemoteTree, "get_dir_hash") ret = main(["add", "data"]) assert ret == 0 @@ -349,7 +349,7 @@ def test_should_collect_dir_cache_only_once(mocker, tmp_dir, dvc): ret = main(["status"]) assert ret == 0 - assert get_dir_checksum_counter.mock.call_count == 1 + assert get_dir_hash_counter.mock.call_count == 1 class SymlinkAddTestBase(TestDvc): diff --git a/tests/func/test_cache.py b/tests/func/test_cache.py index 7f7972dcf7..bbf787ae53 100644 --- a/tests/func/test_cache.py +++ b/tests/func/test_cache.py @@ -36,7 +36,7 @@ def test_all(self): self.assertIn(self.cache2_md5, md5_list) def test_get(self): - cache = Cache(self.dvc).local.checksum_to_path_info(self.cache1_md5) + cache = Cache(self.dvc).local.hash_to_path_info(self.cache1_md5) self.assertEqual(os.fspath(cache), self.cache1) @@ -46,16 +46,16 @@ def _do_test(self, ret): self.assertEqual(len(ret), 0) def test(self): - checksum = "123.dir" - fname = 
os.fspath(self.dvc.cache.local.checksum_to_path_info(checksum)) + dir_hash = "123.dir" + fname = os.fspath(self.dvc.cache.local.hash_to_path_info(dir_hash)) self.create(fname, "not,json") with pytest.raises(DirCacheError): - self.dvc.cache.local.load_dir_cache(checksum) + self.dvc.cache.local.load_dir_cache(dir_hash) - checksum = "234.dir" - fname = os.fspath(self.dvc.cache.local.checksum_to_path_info(checksum)) + dir_hash = "234.dir" + fname = os.fspath(self.dvc.cache.local.hash_to_path_info(dir_hash)) self.create(fname, '{"a": "b"}') - self._do_test(self.dvc.cache.local.load_dir_cache(checksum)) + self._do_test(self.dvc.cache.local.load_dir_cache(dir_hash)) class TestExternalCacheDir(TestDvc): diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index e83de5c804..c34b3a9f07 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -100,8 +100,8 @@ def test(self): # NOTE: modifying cache file for one of the files inside the directory # to check if dvc will detect that the cache is corrupted. entry = self.dvc.cache.local.load_dir_cache(out.checksum)[0] - checksum = entry[self.dvc.cache.local.tree.PARAM_CHECKSUM] - cache = os.fspath(self.dvc.cache.local.checksum_to_path_info(checksum)) + entry_hash = entry[self.dvc.cache.local.tree.PARAM_CHECKSUM] + cache = os.fspath(self.dvc.cache.local.hash_to_path_info(entry_hash)) os.chmod(cache, 0o644) with open(cache, "w+") as fobj: diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index eda784b6c2..8400b846ba 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -581,10 +581,8 @@ def _test_recursive_fetch(self, data_md5, data_sub_md5): self._clear_local_cache() local_cache = self.dvc.cache.local - local_cache_data_path = local_cache.checksum_to_path_info(data_md5) - local_cache_data_sub_path = local_cache.checksum_to_path_info( - data_sub_md5 - ) + local_cache_data_path = local_cache.hash_to_path_info(data_md5) + local_cache_data_sub_path = local_cache.hash_to_path_info(data_sub_md5) self.assertFalse(os.path.exists(local_cache_data_path)) self.assertFalse(os.path.exists(local_cache_data_sub_path)) @@ -596,8 +594,8 @@ def _test_recursive_fetch(self, data_md5, data_sub_md5): def _test_recursive_push(self, data_md5, data_sub_md5): remote = self.cloud.get_remote() - cloud_data_path = remote.checksum_to_path_info(data_md5) - cloud_data_sub_path = remote.checksum_to_path_info(data_sub_md5) + cloud_data_path = remote.hash_to_path_info(data_md5) + cloud_data_sub_path = remote.hash_to_path_info(data_sub_md5) self.assertFalse(os.path.exists(cloud_data_path)) self.assertFalse(os.path.exists(cloud_data_sub_path)) @@ -621,9 +619,9 @@ def test(self): self._test_recursive_pull() -def test_checksum_recalculation(mocker, dvc, tmp_dir): +def test_hash_recalculation(mocker, dvc, tmp_dir): tmp_dir.gen({"foo": "foo"}) - test_get_file_checksum = mocker.spy(LocalRemoteTree, "get_file_checksum") + test_get_file_hash = mocker.spy(LocalRemoteTree, "get_file_hash") url = Local.get_url() ret = main(["remote", "add", "-d", TEST_REMOTE, url]) assert ret == 0 @@ -635,7 +633,7 @@ def test_checksum_recalculation(mocker, dvc, tmp_dir): assert ret == 0 ret = main(["run", "--single-stage", "-d", "foo", "echo foo"]) assert ret == 0 - assert test_get_file_checksum.mock.call_count == 1 + assert test_get_file_hash.mock.call_count == 1 class TestShouldWarnOnNoChecksumInLocalAndRemoteCache(TestDvc): @@ -688,7 +686,7 @@ def test(self): assert self.message_bar_part in self._caplog.text -def 
test_verify_checksums( +def test_verify_hashes( tmp_dir, scm, dvc, mocker, tmp_path_factory, setup_remote ): @@ -702,10 +700,10 @@ def test_verify_checksums( remove("dir") remove(dvc.cache.local.cache_dir) - checksum_spy = mocker.spy(dvc.cache.local.tree, "get_file_checksum") + hash_spy = mocker.spy(dvc.cache.local.tree, "get_file_hash") dvc.pull() - assert checksum_spy.call_count == 0 + assert hash_spy.call_count == 0 # Removing cache will invalidate existing state entries remove(dvc.cache.local.cache_dir) @@ -713,7 +711,7 @@ def test_verify_checksums( dvc.config["remote"]["upstream"]["verify"] = True dvc.pull() - assert checksum_spy.call_count == 3 + assert hash_spy.call_count == 3 @flaky(max_runs=3, min_passes=1) diff --git a/tests/func/test_gc.py b/tests/func/test_gc.py index 52c16adf9b..847fc1529e 100644 --- a/tests/func/test_gc.py +++ b/tests/func/test_gc.py @@ -21,7 +21,7 @@ def setUp(self): self.dvc.add(self.FOO) self.dvc.add(self.DATA_DIR) self.good_cache = [ - self.dvc.cache.local.checksum_to_path_info(md5) + self.dvc.cache.local.hash_to_path_info(md5) for md5 in self.dvc.cache.local.tree.all() ] diff --git a/tests/func/test_remote.py b/tests/func/test_remote.py index bb5c61ad23..a424224cd9 100644 --- a/tests/func/test_remote.py +++ b/tests/func/test_remote.py @@ -145,7 +145,7 @@ def test(self): self.assertEqual(ret, 0) -def test_dir_checksum_should_be_key_order_agnostic(tmp_dir, dvc): +def test_dir_hash_should_be_key_order_agnostic(tmp_dir, dvc): tmp_dir.gen({"data": {"1": "1 content", "2": "2 content"}}) path_info = PathInfo("data") @@ -158,7 +158,7 @@ def test_dir_checksum_should_be_key_order_agnostic(tmp_dir, dvc): {"relpath": "2", "md5": "2"}, ], ): - checksum1 = dvc.cache.local.get_checksum(path_info) + hash1 = dvc.cache.local.get_hash(path_info) with patch.object( BaseRemoteTree, @@ -168,9 +168,9 @@ def test_dir_checksum_should_be_key_order_agnostic(tmp_dir, dvc): {"md5": "2", "relpath": "2"}, ], ): - checksum2 = dvc.cache.local.get_checksum(path_info) + hash2 = dvc.cache.local.get_hash(path_info) - assert checksum1 == checksum2 + assert hash1 == hash2 def test_partial_push_n_pull(tmp_dir, dvc, tmp_path_factory, setup_remote): @@ -194,13 +194,9 @@ def unreliable_upload(self, from_file, to_info, name=None, **kwargs): assert upload_error_info.value.amount == 3 remote = dvc.cloud.get_remote("upstream") - assert not remote.tree.exists( - remote.checksum_to_path_info(foo.checksum) - ) - assert remote.tree.exists(remote.checksum_to_path_info(bar.checksum)) - assert not remote.tree.exists( - remote.checksum_to_path_info(baz.checksum) - ) + assert not remote.tree.exists(remote.hash_to_path_info(foo.checksum)) + assert remote.tree.exists(remote.hash_to_path_info(bar.checksum)) + assert not remote.tree.exists(remote.hash_to_path_info(baz.checksum)) # Push everything and delete local cache dvc.push() @@ -397,7 +393,7 @@ def test_protect_local_remote(tmp_dir, dvc, setup_remote): dvc.push() remote = dvc.cloud.get_remote("upstream") - remote_cache_file = remote.checksum_to_path_info(stage.outs[0].checksum) + remote_cache_file = remote.hash_to_path_info(stage.outs[0].checksum) assert os.path.exists(remote_cache_file) assert stat.S_IMODE(os.stat(remote_cache_file).st_mode) == 0o444 diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 7c7dd0ed59..b08b014e45 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -192,8 +192,8 @@ def test_repotree_walk_fetch(tmp_dir, dvc, scm, setup_remote): assert os.path.exists(out.cache_path) for entry in out.dir_cache: - 
checksum = entry[out.remote.tree.PARAM_CHECKSUM] - assert os.path.exists(dvc.cache.local.checksum_to_path_info(checksum)) + hash_ = entry[out.remote.tree.PARAM_CHECKSUM] + assert os.path.exists(dvc.cache.local.hash_to_path_info(hash_)) def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, setup_remote): @@ -211,7 +211,7 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, setup_remote): # into dvc.cache, not fetched or streamed from a remote tree = RepoTree(erepo_dir.dvc, stream=True) expected = [ - tree.get_file_checksum(erepo_dir / path) + tree.get_file_hash(erepo_dir / path) for path in ("dir/bar", "dir/subdir/foo") ] @@ -219,8 +219,8 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, setup_remote): cache = dvc.cache.local with cache.state: cache.save(PathInfo(erepo_dir / "dir"), tree, None) - for checksum in expected: - assert os.path.exists(cache.checksum_to_path_info(checksum)) + for hash_ in expected: + assert os.path.exists(cache.hash_to_path_info(hash_)) def test_cleantree_subrepo(tmp_dir, dvc, scm, monkeypatch): diff --git a/tests/unit/remote/test_azure.py b/tests/unit/remote/test_azure.py index 0c6edaac06..360ab3927f 100644 --- a/tests/unit/remote/test_azure.py +++ b/tests/unit/remote/test_azure.py @@ -32,7 +32,7 @@ def test_init(dvc): assert tree.connection_string == connection_string -def test_get_file_checksum(tmp_dir): +def test_get_file_hash(tmp_dir): if not Azure.should_test(): pytest.skip("no azurite running") @@ -42,7 +42,7 @@ def test_get_file_checksum(tmp_dir): to_info = tree.PATH_CLS(Azure.get_url()) tree.upload(PathInfo("foo"), to_info) assert tree.exists(to_info) - checksum = tree.get_file_checksum(to_info) - assert checksum - assert isinstance(checksum, str) - assert checksum.strip("'").strip('"') == checksum + hash_ = tree.get_file_hash(to_info) + assert hash_ + assert isinstance(hash_, str) + assert hash_.strip("'").strip('"') == hash_ diff --git a/tests/unit/remote/test_base.py b/tests/unit/remote/test_base.py index e560452a62..1d6bbe4001 100644 --- a/tests/unit/remote/test_base.py +++ b/tests/unit/remote/test_base.py @@ -45,19 +45,19 @@ def test_cmd_error(dvc): BaseRemoteTree(dvc, config).remove("file") -@mock.patch.object(BaseRemoteTree, "list_checksums_traverse") -@mock.patch.object(BaseRemoteTree, "list_checksums_exists") -def test_checksums_exist(object_exists, traverse, dvc): +@mock.patch.object(BaseRemoteTree, "list_hashes_traverse") +@mock.patch.object(BaseRemoteTree, "list_hashes_exists") +def test_hashes_exist(object_exists, traverse, dvc): remote = Remote(BaseRemoteTree(dvc, {})) # remote does not support traverse remote.tree.CAN_TRAVERSE = False with mock.patch.object( - remote.tree, "list_checksums", return_value=list(range(256)) + remote.tree, "list_hashes", return_value=list(range(256)) ): - checksums = set(range(1000)) - remote.checksums_exist(checksums) - object_exists.assert_called_with(checksums, None, None) + hashes = set(range(1000)) + remote.hashes_exist(hashes) + object_exists.assert_called_with(hashes, None, None) traverse.assert_not_called() remote.tree.CAN_TRAVERSE = True @@ -66,19 +66,19 @@ def test_checksums_exist(object_exists, traverse, dvc): object_exists.reset_mock() traverse.reset_mock() with mock.patch.object( - remote.tree, "list_checksums", return_value=list(range(256)) + remote.tree, "list_hashes", return_value=list(range(256)) ): - checksums = list(range(1000)) - remote.checksums_exist(checksums) + hashes = list(range(1000)) + remote.hashes_exist(hashes) # verify that _cache_paths_with_max() short 
circuits - # before returning all 256 remote checksums - max_checksums = math.ceil( - remote.tree._max_estimation_size(checksums) + # before returning all 256 remote hashes + max_hashes = math.ceil( + remote.tree._max_estimation_size(hashes) / pow(16, remote.tree.TRAVERSE_PREFIX_LEN) ) - assert max_checksums < 256 + assert max_hashes < 256 object_exists.assert_called_with( - frozenset(range(max_checksums, 1000)), None, None + frozenset(range(max_hashes, 1000)), None, None ) traverse.assert_not_called() @@ -87,10 +87,10 @@ def test_checksums_exist(object_exists, traverse, dvc): traverse.reset_mock() remote.tree.JOBS = 16 with mock.patch.object( - remote.tree, "list_checksums", return_value=list(range(256)) + remote.tree, "list_hashes", return_value=list(range(256)) ): - checksums = list(range(1000000)) - remote.checksums_exist(checksums) + hashes = list(range(1000000)) + remote.hashes_exist(hashes) object_exists.assert_not_called() traverse.assert_called_with( 256 * pow(16, remote.tree.TRAVERSE_PREFIX_LEN), @@ -101,50 +101,50 @@ def test_checksums_exist(object_exists, traverse, dvc): @mock.patch.object( - BaseRemoteTree, "list_checksums", return_value=[], + BaseRemoteTree, "list_hashes", return_value=[], ) @mock.patch.object( - BaseRemoteTree, "path_to_checksum", side_effect=lambda x: x, + BaseRemoteTree, "path_to_hash", side_effect=lambda x: x, ) -def test_list_checksums_traverse(path_to_checksum, list_checksums, dvc): +def test_list_hashes_traverse(path_to_hash, list_hashes, dvc): tree = BaseRemoteTree(dvc, {}) tree.path_info = PathInfo("foo") # parallel traverse size = 256 / tree.JOBS * tree.LIST_OBJECT_PAGE_SIZE - list(tree.list_checksums_traverse(size, {0})) + list(tree.list_hashes_traverse(size, {0})) for i in range(1, 16): - list_checksums.assert_any_call( + list_hashes.assert_any_call( prefix=f"{i:03x}", progress_callback=CallableOrNone ) for i in range(1, 256): - list_checksums.assert_any_call( + list_hashes.assert_any_call( prefix=f"{i:02x}", progress_callback=CallableOrNone ) # default traverse (small remote) size -= 1 - list_checksums.reset_mock() - list(tree.list_checksums_traverse(size - 1, {0})) - list_checksums.assert_called_with( + list_hashes.reset_mock() + list(tree.list_hashes_traverse(size - 1, {0})) + list_hashes.assert_called_with( prefix=None, progress_callback=CallableOrNone ) -def test_list_checksums(dvc): +def test_list_hashes(dvc): tree = BaseRemoteTree(dvc, {}) tree.path_info = PathInfo("foo") with mock.patch.object( tree, "list_paths", return_value=["12/3456", "bar"] ): - checksums = list(tree.list_checksums()) - assert checksums == ["123456"] + hashes = list(tree.list_hashes()) + assert hashes == ["123456"] @pytest.mark.parametrize( - "checksum, result", + "hash_, result", [(None, False), ("", False), ("3456.dir", True), ("3456", False)], ) -def test_is_dir_checksum(checksum, result): - assert BaseRemoteTree.is_dir_checksum(checksum) == result +def test_is_dir_hash(hash_, result): + assert BaseRemoteTree.is_dir_hash(hash_) == result diff --git a/tests/unit/remote/test_index.py b/tests/unit/remote/test_index.py index 2331f0b252..7e475b3aaa 100644 --- a/tests/unit/remote/test_index.py +++ b/tests/unit/remote/test_index.py @@ -19,9 +19,9 @@ def test_init(dvc, index): assert str(index.path) == os.path.join(dvc.index_dir, "foo.idx") -def test_is_dir_checksum(dvc, index): - assert index.is_dir_checksum("foo.dir") - assert not index.is_dir_checksum("foo") +def test_is_dir_hash(dvc, index): + assert index.is_dir_hash("foo.dir") + assert not index.is_dir_hash("foo") 
def test_roundtrip(dvc, index): @@ -30,8 +30,8 @@ def test_roundtrip(dvc, index): index.update(expected_dir, expected_file) index.dump() index.load() - assert set(index.dir_checksums()) == expected_dir - assert set(index.checksums()) == expected_dir | expected_file + assert set(index.dir_hashes()) == expected_dir + assert set(index.hashes()) == expected_dir | expected_file def test_clear(dvc, index): @@ -39,19 +39,19 @@ def test_clear(dvc, index): ["1234.dir"], ["5678"], ) index.clear() - assert first(index.checksums()) is None + assert first(index.hashes()) is None def test_update(dvc, index): expected_dir = {"1234.dir"} expected_file = {"5678"} index.update(expected_dir, expected_file) - assert set(index.dir_checksums()) == expected_dir - assert set(index.checksums()) == expected_dir | expected_file + assert set(index.dir_hashes()) == expected_dir + assert set(index.hashes()) == expected_dir | expected_file def test_intersection(dvc, index): - checksums = (str(i) for i in range(2000)) + hashes = (str(i) for i in range(2000)) expected = {str(i) for i in range(1000)} - index.update([], checksums) + index.update([], hashes) assert set(index.intersection(expected)) == expected diff --git a/tests/unit/remote/test_local.py b/tests/unit/remote/test_local.py index 28d3716189..24aafc1be5 100644 --- a/tests/unit/remote/test_local.py +++ b/tests/unit/remote/test_local.py @@ -21,16 +21,16 @@ def test_status_download_optimization(mocker, dvc): infos.add("local", "37b51d194a7513e45b56f6524f2d51f2", "bar") local_exists = list(infos["local"]) - mocker.patch.object(cache, "checksums_exist", return_value=local_exists) + mocker.patch.object(cache, "hashes_exist", return_value=local_exists) other_remote = mocker.Mock() other_remote.url = "other_remote" - other_remote.checksums_exist.return_value = [] + other_remote.hashes_exist.return_value = [] other_remote.index = RemoteIndexNoop() cache.status(infos, other_remote, download=True) - assert other_remote.checksums_exist.call_count == 0 + assert other_remote.hashes_exist.call_count == 0 @pytest.mark.parametrize("link_name", ["hardlink", "symlink"]) diff --git a/tests/unit/remote/test_remote.py b/tests/unit/remote/test_remote.py index a748bde825..7b2677759e 100644 --- a/tests/unit/remote/test_remote.py +++ b/tests/unit/remote/test_remote.py @@ -5,30 +5,30 @@ from dvc.remote.s3 import S3RemoteTree -def test_remote_with_checksum_jobs(dvc): - dvc.config["remote"]["with_checksum_jobs"] = { +def test_remote_with_hash_jobs(dvc): + dvc.config["remote"]["with_hash_jobs"] = { "url": "s3://bucket/name", - "checksum_jobs": 100, + "hash_jobs": 100, } - dvc.config["core"]["checksum_jobs"] = 200 + dvc.config["core"]["hash_jobs"] = 200 - tree = get_cloud_tree(dvc, name="with_checksum_jobs") - assert tree.checksum_jobs == 100 + tree = get_cloud_tree(dvc, name="with_hash_jobs") + assert tree.hash_jobs == 100 -def test_remote_without_checksum_jobs(dvc): - dvc.config["remote"]["without_checksum_jobs"] = {"url": "s3://bucket/name"} - dvc.config["core"]["checksum_jobs"] = 200 +def test_remote_without_hash_jobs(dvc): + dvc.config["remote"]["without_hash_jobs"] = {"url": "s3://bucket/name"} + dvc.config["core"]["hash_jobs"] = 200 - tree = get_cloud_tree(dvc, name="without_checksum_jobs") - assert tree.checksum_jobs == 200 + tree = get_cloud_tree(dvc, name="without_hash_jobs") + assert tree.hash_jobs == 200 -def test_remote_without_checksum_jobs_default(dvc): - dvc.config["remote"]["without_checksum_jobs"] = {"url": "s3://bucket/name"} +def test_remote_without_hash_jobs_default(dvc): 
+ dvc.config["remote"]["without_hash_jobs"] = {"url": "s3://bucket/name"} - tree = get_cloud_tree(dvc, name="without_checksum_jobs") - assert tree.checksum_jobs == tree.CHECKSUM_JOBS + tree = get_cloud_tree(dvc, name="without_hash_jobs") + assert tree.hash_jobs == tree.HASH_JOBS @pytest.mark.parametrize("tree_cls", [GSRemoteTree, S3RemoteTree])