Skip to content

Commit

Permalink
index: introduce build_data_index
Browse files Browse the repository at this point in the history
This is a generally useful function that builds a data index from the repo index.

This is currently only used in cloud versioning, but it is going to be used in
`status`, `diff` and other data-centric operations.
  • Loading branch information
efiop committed Jan 14, 2023
1 parent fc1a192 commit aabdddc
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 21 deletions.
48 changes: 48 additions & 0 deletions dvc/repo/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from dvc_data.hashfile.hash_info import HashInfo
from dvc_data.index import DataIndex, DataIndexKey, DataIndexView
from dvc_objects.db import ObjectDB
from dvc_objects.fs.base import FileSystem


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -447,3 +448,50 @@ def key_filter(workspace: str, key: "DataIndexKey"):
else:
data[workspace] = DataIndex()
return data


def build_data_index(
    index: Union["Index", "IndexView"],
    path: str,
    fs: "FileSystem",
    workspace: Optional[str] = "repo",
) -> "DataIndex":
    """Build a ``DataIndex`` describing the outputs of ``index`` as found on ``fs``.

    Every output in ``index.outs`` that uses the cache and whose index
    workspace equals ``workspace`` is looked up under ``path`` on ``fs``, at
    the same location it has relative to its repo root.

    Args:
        index: the repo index (or a view of it) whose outputs are scanned.
        path: base path on ``fs`` that stands in for the repo root.
        fs: filesystem to build entries from.
        workspace: only outputs whose index key belongs to this workspace
            are included.

    Returns:
        A new ``DataIndex`` holding one entry per matching output and, for
        outputs that are directories, one entry per item yielded by
        ``build_entries``.
    """
    from dvc_data.index import DataIndex, DataIndexEntry
    from dvc_data.index.build import build_entries, build_entry

    data = DataIndex()
    for out in index.outs:
        if not out.use_cache:
            continue

        out_workspace, out_key = out.index_key
        if out_workspace != workspace:
            continue

        # Mirror the output's repo-relative location under `path` on `fs`.
        rel_parts = out.fs.path.relparts(out.fs_path, out.repo.root_dir)
        target = fs.path.join(path, *rel_parts)

        try:
            root = build_entry(target, fs)
        except FileNotFoundError:
            # Nothing at the target path: record a bare entry for it anyway.
            root = DataIndexEntry(path=target, fs=fs)
        root.key = out_key

        is_dir = bool(root.meta and root.meta.isdir)
        if is_dir:
            root.loaded = True
        data.add(root)
        if not is_dir:
            continue

        for child in build_entries(target, fs):
            # NOTE: whether the root will be returned by build_entries
            # depends on the filesystem (e.g. local doesn't, but s3 does).
            if child.key and child.key != ("",):
                child.key = out_key + child.key
            else:
                child.key = out_key
            data.add(child)

    return data
23 changes: 2 additions & 21 deletions dvc/repo/worktree.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def push_worktree(
jobs: Optional[int] = None,
**kwargs,
) -> int:
from dvc.repo.index import build_data_index
from dvc_data.index import checkout

view = worktree_view(
Expand All @@ -108,7 +109,7 @@ def push_worktree(
new_index = view.data["repo"]
if remote.worktree:
logger.debug("indexing latest worktree for '%s'", remote.path)
old_index = _build_worktree_index(repo, remote, view)
old_index = build_data_index(view, remote.path, remote.fs)
logger.debug("Pushing worktree changes to '%s'", remote.path)
else:
old_index = None
Expand Down Expand Up @@ -147,26 +148,6 @@ def push_worktree(
return pushed


def _build_worktree_index(
    repo: "Repo", remote: "Remote", view: "IndexView"
) -> "DataIndex":
    """Build a ``DataIndex`` from what ``build_entries`` finds on the remote.

    For each output in ``view.outs``, its path relative to ``repo.root_dir``
    is joined onto ``remote.path``, and every entry that ``build_entries``
    yields for that location is added to the index under the output's key.

    Args:
        repo: repo whose ``root_dir`` the output paths are relative to.
        remote: remote providing the ``fs`` and base ``path`` to scan.
        view: index view whose outputs are scanned.

    Returns:
        A new ``DataIndex`` with all entries found for the view's outputs.
    """
    from dvc_data.index import DataIndex
    from dvc_data.index.build import build_entries

    worktree_index = DataIndex()
    for out in view.outs:
        _, out_key = out.index_key
        rel_parts = out.fs.path.relparts(out.fs_path, repo.root_dir)
        remote_path = remote.fs.path.join(remote.path, *rel_parts)
        for found in build_entries(remote_path, remote.fs):
            # An empty or ("",) key is replaced by the output's own key;
            # any other key is prefixed with it.
            if found.key and found.key != ("",):
                found.key = out_key + found.key
            else:
                found.key = out_key
            worktree_index.add(found)
    return worktree_index


def _update_out_meta(
out: "Output", index: Union["DataIndex", "DataIndexView"]
):
Expand Down

0 comments on commit aabdddc

Please sign in to comment.