Skip to content

Commit

Permalink
dvcfs: implement basic du
Browse files Browse the repository at this point in the history
  • Loading branch information
efiop committed Dec 12, 2023
1 parent 3c31234 commit 8ccae0a
Show file tree
Hide file tree
Showing 7 changed files with 237 additions and 0 deletions.
2 changes: 2 additions & 0 deletions dvc/cli/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
data_sync,
destroy,
diff,
du,
experiments,
freeze,
gc,
Expand Down Expand Up @@ -95,6 +96,7 @@
data,
artifacts,
studio,
du,
]


Expand Down
80 changes: 80 additions & 0 deletions dvc/commands/du.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import argparse
import logging

from dvc.cli import completion
from dvc.cli.command import CmdBaseNoRepo
from dvc.cli.utils import DictAction, append_doc_link
from dvc.ui import ui

logger = logging.getLogger(__name__)


class CmdDU(CmdBaseNoRepo):
    """Command handler behind ``dvc du``."""

    def run(self):
        """Fetch disk usage via ``Repo.du`` and print it as a size/path table.

        The parsed command-line arguments on ``self.args`` are forwarded to
        ``Repo.du`` unchanged; each returned ``(path, size)`` pair is shown
        with the size rendered by ``naturalsize``.

        Returns:
            ``0`` once the table has been printed.
        """
        from dvc.repo import Repo
        from dvc.utils.humanize import naturalsize

        args = self.args
        usage = Repo.du(
            args.url,
            args.path,
            rev=args.rev,
            summarize=args.summarize,
            config=args.config,
            remote=args.remote,
            remote_config=args.remote_config,
        )

        # The table puts the size column first, then the path.
        rows = []
        for path, size in usage:
            rows.append((naturalsize(size), path))
        ui.table(rows)
        return 0


def add_parser(subparsers, parent_parser):
    """Register the ``du`` subcommand and its arguments on ``subparsers``.

    Args:
        subparsers: the subparsers collection the ``du`` parser is added to.
        parent_parser: parser whose arguments the ``du`` parser inherits.
    """
    summary = "Show disk usage."
    parser = subparsers.add_parser(
        "du",
        parents=[parent_parser],
        description=append_doc_link(summary, "du"),
        help=summary,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # Positional: the repository to inspect.
    parser.add_argument("url", help="Location of DVC repository")

    # Options.
    parser.add_argument(
        "--rev",
        nargs="?",
        help="Git revision (e.g. SHA, branch, tag)",
        metavar="<commit>",
    )
    parser.add_argument(
        "-s",
        "--summarize",
        action="store_true",
        help="Show total disk usage.",
    )
    parser.add_argument(
        "--config",
        type=str,
        help=(
            "Path to a config file that will be merged with the config "
            "in the target repository."
        ),
    )
    parser.add_argument(
        "--remote",
        type=str,
        help="Remote name to set as a default in the target repository.",
    )
    parser.add_argument(
        "--remote-config",
        type=str,
        nargs="*",
        action=DictAction,
        help=(
            "Remote config options to merge with a remote's config (default or one "
            "specified by '--remote') in the target repository."
        ),
    )

    # Optional positional: registered last so it follows ``url``.
    path_argument = parser.add_argument(
        "path",
        nargs="?",
        help="Path to directory within the repository",
    )
    path_argument.complete = completion.DIR

    parser.set_defaults(func=CmdDU)
41 changes: 41 additions & 0 deletions dvc/fs/dvc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import posixpath
import threading
from collections import deque
from contextlib import ExitStack, suppress
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Type, Union

Expand Down Expand Up @@ -60,6 +61,7 @@ def _merge_info(repo, key, fs_info, dvc_info):
if fs_info:
ret["type"] = fs_info["type"]
ret["size"] = fs_info["size"]
ret["fs_info"] = fs_info
isexec = False
if fs_info["type"] == "file":
isexec = utils.is_exec(fs_info["mode"])
Expand Down Expand Up @@ -421,6 +423,45 @@ def get_file(self, rpath, lpath, **kwargs):
dvc_path = _get_dvc_path(dvc_fs, subkey)
return dvc_fs.get_file(dvc_path, lpath, **kwargs)

def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
    """Compute the space used under ``path``.

    The tree is walked breadth-first starting from ``self.info(path)``.
    Files contribute their ``size`` (a missing size counts as 0).  A
    directory whose info carries ``dvc_info`` with an ``entry`` of known
    size, and no ``fs_info``, is not listed: its recorded ``entry.size`` is
    used as the size of the whole directory instead.  Every other directory
    is expanded with ``self.ls(..., detail=True)``.

    Args:
        path: file or directory to measure.
        total: when true return a single number, otherwise a mapping of
            name to size.
        maxdepth: not supported; must be ``None``.
        withdirs: also record the sizes of the directories that get
            expanded.
        **kwargs: accepted and ignored.

    Returns:
        The summed size when ``total`` is true; otherwise a dict of
        ``name -> size``.  Directories that were sized from their DVC entry
        appear in the dict as a single item, so the values of the dict
        always add up to the number returned with ``total=True``.

    Raises:
        NotImplementedError: if ``maxdepth`` is given.
    """
    if maxdepth is not None:
        raise NotImplementedError

    sizes = {}
    todo = deque([self.info(path)])
    while todo:
        info = todo.popleft()
        name = info["name"]
        size = info["size"] or 0

        if info["type"] != "directory":
            sizes[name] = size
            continue

        dvc_info = info.get("dvc_info") or {}
        fs_info = info.get("fs_info")
        entry = dvc_info.get("entry")
        if (
            dvc_info
            and not fs_info
            and entry is not None
            and entry.size is not None
        ):
            # The directory's size is already recorded on its DVC entry, so
            # there is no need to list it.  It is stored in the same mapping
            # as everything else so that it is not lost when total=False.
            sizes[name] = entry.size
            continue

        if withdirs:
            sizes[name] = size

        todo.extend(self.ls(name, detail=True))

    if total:
        return sum(sizes.values())

    return sizes

def close(self):
self._repo_stack.close()

Expand Down
2 changes: 2 additions & 0 deletions dvc/repo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class Repo:
from dvc.repo.commit import commit # type: ignore[misc]
from dvc.repo.destroy import destroy # type: ignore[misc]
from dvc.repo.diff import diff # type: ignore[misc]
from dvc.repo.du import du as _du # type: ignore[misc]
from dvc.repo.fetch import fetch # type: ignore[misc]
from dvc.repo.freeze import freeze, unfreeze # type: ignore[misc]
from dvc.repo.gc import gc # type: ignore[misc]
Expand All @@ -93,6 +94,7 @@ class Repo:
from .cache import check_missing as cache_check_missing # type: ignore[misc]
from .data import status as data_status # type: ignore[misc]

du = staticmethod(_du)
ls = staticmethod(_ls)
ls_url = staticmethod(_ls_url)
get = staticmethod(_get)
Expand Down
42 changes: 42 additions & 0 deletions dvc/repo/du.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Any, Dict, Optional, Union


def du(
    url: str,
    path: Optional[str] = None,
    rev: Optional[str] = None,
    summarize: bool = False,
    config: Union[None, Dict[str, Any], str] = None,
    remote: Optional[str] = None,
    remote_config: Optional[dict] = None,
):
    """Report disk usage for ``path`` inside the repository at ``url``.

    Args:
        url: location of the repository to open.
        path: path within the repository; defaults to ``""``.
        rev: revision passed to ``Repo.open``.
        summarize: report only the total for ``path`` instead of one row per
            direct child.
        config: either a path to a config file (loaded with
            ``Config.load_file``) or an already-parsed config dict.
        remote: remote name passed to ``Repo.open``.
        remote_config: remote config options passed to ``Repo.open``.

    Returns:
        A list of ``(path, size)`` tuples.  For a non-directory, or when
        ``summarize`` is set, the list holds the single entry for ``path``.
        Otherwise it holds one entry per item returned by ``fs.ls(path)``
        followed by a final entry for ``path`` with the sum of those sizes.
    """
    from dvc.config import Config

    from . import Repo

    if config and not isinstance(config, dict):
        config_dict = Config.load_file(config)
    else:
        # A dict is already a parsed config, so it is forwarded as-is rather
        # than being discarded; empty/missing values collapse to None.
        config_dict = config or None

    with Repo.open(
        url,
        rev=rev,
        subrepos=True,
        uninitialized=True,
        config=config_dict,
        remote=remote,
        remote_config=remote_config,
    ) as repo:
        path = path or ""

        fs = repo.dvcfs

        if summarize or not fs.isdir(path):
            return [(path, fs.du(path, total=True))]

        ret = [
            (entry_path, fs.du(entry_path, total=True)) for entry_path in fs.ls(path)
        ]
        # The directory's own row comes last and is the sum of its children.
        ret.append((path, sum(entry[1] for entry in ret)))
        return ret
49 changes: 49 additions & 0 deletions tests/func/test_du.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import os


def test_du(tmp_dir, dvc):
    """Check ``dvc.du`` sizes for plain and DVC-added files and directories."""
    layout = {
        "file": b"file",
        "dvcfile": b"dvcfile",
        "dir": {
            "dirfile": b"dirfile",
            "subdir": {
                "subdirfile": b"subdirfile",
            },
            "dvcsubdir": {
                "dvcsubdirfile": b"dvcsubdirfile",
            },
        },
    }
    tmp_dir.gen(layout)

    for target in ("dvcfile", os.path.join("dir", "dvcsubdir")):
        dvc.add(target)

    # Single files yield exactly one row, compared as a list.
    assert dvc.du(".", "file") == [("file", 4)]
    assert dvc.du(".", "dvcfile") == [("dvcfile", 7)]

    # Directories: the full listing is compared order-insensitively, and the
    # summarized form must be the single total row.
    directory_cases = {
        "dir/subdir": (
            {
                ("dir/subdir/subdirfile", 10),
                ("dir/subdir", 10),
            },
            10,
        ),
        "dir/dvcsubdir": (
            {
                ("dir/dvcsubdir/dvcsubdirfile", 13),
                ("dir/dvcsubdir", 13),
            },
            13,
        ),
        "dir": (
            {
                ("dir/dvcsubdir", 13),
                ("dir/subdir", 10),
                ("dir/dirfile", 7),
                ("dir", 30),
            },
            30,
        ),
        "/": (
            {
                ("/dvcfile", 7),
                ("/dir", 30),
                ("/file", 4),
                ("/", 41),
            },
            41,
        ),
    }
    for path, (listing, total_size) in directory_cases.items():
        assert set(dvc.du(".", path)) == listing
        assert dvc.du(".", path, summarize=True) == [(path, total_size)]
21 changes: 21 additions & 0 deletions tests/unit/command/test_du.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from dvc.cli import parse_args
from dvc.commands.du import CmdDU


def test_du(mocker):
    """The ``du`` CLI command forwards its parsed arguments to ``Repo.du``."""
    argv = ["du", "myurl", "mypath", "--summarize", "--rev", "myrev"]
    cli_args = parse_args(argv)
    assert cli_args.func == CmdDU

    cmd = cli_args.func(cli_args)
    mock_du = mocker.patch("dvc.repo.Repo.du")

    exit_code = cmd.run()
    assert exit_code == 0

    # Options not given on the command line are expected to arrive as None.
    expected_kwargs = {
        "rev": "myrev",
        "summarize": True,
        "config": None,
        "remote": None,
        "remote_config": None,
    }
    mock_du.assert_called_once_with("myurl", "mypath", **expected_kwargs)

0 comments on commit 8ccae0a

Please sign in to comment.