From 8c484645fa74db8824c252f21b9311411b47f13e Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Mon, 26 Sep 2022 02:36:30 +0300 Subject: [PATCH] cloud-versioning: better handling for directories --- dvc/dvcfile.py | 8 ++++---- dvc/output.py | 7 ++----- dvc/repo/fetch.py | 19 +------------------ dvc/repo/index.py | 5 ++++- dvc/repo/push.py | 6 ++---- dvc/stage/loader.py | 1 + dvc/stage/serialize.py | 13 ++++++++++--- setup.cfg | 2 +- 8 files changed, 25 insertions(+), 36 deletions(-) diff --git a/dvc/dvcfile.py b/dvc/dvcfile.py index 7cd136f3ed..c409aebb26 100644 --- a/dvc/dvcfile.py +++ b/dvc/dvcfile.py @@ -237,10 +237,10 @@ def dump( self._dump_pipeline_file(stage) if update_lock: - self._dump_lockfile(stage) + self._dump_lockfile(stage, **kwargs) - def _dump_lockfile(self, stage): - self._lockfile.dump(stage) + def _dump_lockfile(self, stage, **kwargs): + self._lockfile.dump(stage, **kwargs) @staticmethod def _check_if_parametrized(stage, action: str = "dump") -> None: @@ -366,7 +366,7 @@ def latest_version_info(self): return {SCHEMA_KWD: version} def dump(self, stage, **kwargs): - stage_data = serialize.to_lockfile(stage) + stage_data = serialize.to_lockfile(stage, **kwargs) with modify_yaml(self.path, fs=self.repo.fs) as data: version = LOCKFILE_VERSION.from_dict(data) diff --git a/dvc/output.py b/dvc/output.py index e4277a3864..a883238164 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -765,11 +765,8 @@ def dumpd(self, **kwargs): if self.remote: ret[self.PARAM_REMOTE] = self.remote - if ( - self.use_cache - and self.is_in_repo - and self.hash_info.isdir - and (kwargs.get("with_files") or self.files is not None) + if self.hash_info.isdir and ( + kwargs.get("with_files") or self.files is not None ): if self.obj: obj = self.obj diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index fd9c0d16ac..8166ae75e5 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -15,7 +15,7 @@ def _fetch_worktree(repo, remote): - from dvc_data.index import md5, save + from dvc_data.index import save index = repo.index.data["repo"] for key, entry in index.iteritems(): @@ -24,24 +24,7 @@ def _fetch_worktree(repo, remote): remote.path, *key, ) - md5(index) save(index) - - for stage in repo.index.stages: - for out in stage.outs: - if not out.use_cache: - continue - - if not out.is_in_repo: - continue - - workspace, key = out.index_key - entry = repo.index.data[workspace][key] - out.hash_info = entry.hash_info - out.meta = entry.meta - - stage.dvcfile.dump(stage) - return len(index) diff --git a/dvc/repo/index.py b/dvc/repo/index.py index 0c991e1837..355ff4dcba 100644 --- a/dvc/repo/index.py +++ b/dvc/repo/index.py @@ -192,9 +192,12 @@ def data(self) -> "Dict[str, DataIndex]": data_index = by_workspace[workspace] + if out.files: + out.obj = out.get_obj() + data_index[key] = DataIndexEntry( meta=out.meta, - obj=out.get_obj() if out.files else out.obj, + obj=out.obj, hash_info=out.hash_info, odb=out.odb, cache=out.odb, diff --git a/dvc/repo/push.py b/dvc/repo/push.py index 21841b23c5..2075c004f8 100644 --- a/dvc/repo/push.py +++ b/dvc/repo/push.py @@ -11,13 +11,11 @@ def _push_worktree(repo, remote): from dvc_data.hashfile.tree import tree_from_index - from dvc_data.index import build, checkout, collect + from dvc_data.index import checkout, collect index = repo.index.data["repo"] checkout(index, remote.path, remote.fs, force=True) - build(index, remote.path, remote.fs) - if any(out.isdir() for out in repo.index.outs): - collect(index, remote.path, remote.fs) + collect(index, remote.path, remote.fs, update=True) for stage in repo.index.stages: for out in stage.outs: diff --git a/dvc/stage/loader.py b/dvc/stage/loader.py index b9e1b2814e..7e4ff45fd9 100644 --- a/dvc/stage/loader.py +++ b/dvc/stage/loader.py @@ -73,6 +73,7 @@ def fill_from_lock(stage, lock_data=None): item.hash_info = HashInfo(item.hash_name, hash_value) if item.hash_info and item.hash_info.isdir: item.meta.isdir = True + item.files = get_in(checksums, [key, path, item.PARAM_FILES]) @classmethod def load_stage(cls, dvcfile, name, stage_data, lock_data=None): diff --git a/dvc/stage/serialize.py b/dvc/stage/serialize.py index b3da928eb0..0b3c93ef7b 100644 --- a/dvc/stage/serialize.py +++ b/dvc/stage/serialize.py @@ -152,7 +152,7 @@ def to_pipeline_file(stage: "PipelineStage"): } -def to_single_stage_lockfile(stage: "Stage") -> dict: +def to_single_stage_lockfile(stage: "Stage", **kwargs) -> dict: assert stage.cmd def _dumpd(item): @@ -164,6 +164,13 @@ def _dumpd(item): *meta_d.items(), ] + if item.hash_info.isdir and kwargs.get("with_files"): + if item.obj: + obj = item.obj + else: + obj = item.get_obj() + ret.append((item.PARAM_FILES, obj.as_list(with_meta=True))) + return OrderedDict(ret) res = OrderedDict([("cmd", stage.cmd)]) @@ -183,9 +190,9 @@ def _dumpd(item): return res -def to_lockfile(stage: "PipelineStage") -> dict: +def to_lockfile(stage: "PipelineStage", **kwargs) -> dict: assert stage.name - return {stage.name: to_single_stage_lockfile(stage)} + return {stage.name: to_single_stage_lockfile(stage, **kwargs)} def to_single_stage_file(stage: "Stage", **kwargs): diff --git a/setup.cfg b/setup.cfg index 6e6047344b..df55840111 100644 --- a/setup.cfg +++ b/setup.cfg @@ -67,7 +67,7 @@ install_requires = dvc-render==0.0.11 dvc-task==0.1.2 dvclive>=0.10.0 - dvc-data==0.12.0 + dvc-data==0.13.0 dvc-http==2.27.2 hydra-core>=1.1.0 iterative-telemetry==0.0.5