Skip to content

Commit

Permalink
dvc: Use "DVC-file" term as much as possible
Browse files Browse the repository at this point in the history
Keep "stage file" in the context of pipelines

No issue
  • Loading branch information
jorgeorpinel committed Jun 26, 2019
1 parent aa5e8c7 commit fba7d85
Show file tree
Hide file tree
Showing 9 changed files with 195 additions and 22 deletions.
6 changes: 3 additions & 3 deletions dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class OutputNotFoundError(DvcException):

def __init__(self, output):
super(OutputNotFoundError, self).__init__(
"unable to find stage file with output '{path}'".format(
"unable to find DVC-file with output '{path}'".format(
path=relpath(output)
)
)
Expand All @@ -74,7 +74,7 @@ class StagePathAsOutputError(DvcException):
Args:
cwd (str): path to the directory.
fname (str): path to the stage file that has cwd specified as an
fname (str): path to the DVC-file that has cwd specified as an
output.
"""

Expand Down Expand Up @@ -216,7 +216,7 @@ class StageFileCorruptedError(DvcException):
def __init__(self, path, cause=None):
path = relpath(path)
super(StageFileCorruptedError, self).__init__(
"unable to read stage file: {} "
"unable to read DVC-file: {} "
"YAML file structure is corrupted".format(path),
cause=cause,
)
Expand Down
2 changes: 1 addition & 1 deletion dvc/repo/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def add(repo, target, recursive=False, no_commit=False, fname=None):
logger.warning(
"You are adding a large directory '{target}' recursively,"
" consider tracking it as a whole instead.\n"
"{purple}HINT:{nc} Remove the generated stage files and then"
"{purple}HINT:{nc} Remove the generated DVC-file and then"
" run {cyan}dvc add {target}{nc}".format(
purple=colorama.Fore.MAGENTA,
cyan=colorama.Fore.CYAN,
Expand Down
2 changes: 1 addition & 1 deletion dvc/repo/metrics/show.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def _collect_metrics(repo, path, recursive, typ, xpath, branch):
outs = repo.find_outs_by_path(path, outs=outs, recursive=recursive)
except OutputNotFoundError:
logger.debug(
"stage file not for found for '{}' in branch '{}'".format(
"DVC-file not for found for '{}' in branch '{}'".format(
path, branch
)
)
Expand Down
2 changes: 1 addition & 1 deletion dvc/repo/move.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def move(self, from_path, to_path):
to reflect the change on the pipeline.
If the output has the same name as its stage, it would
also rename the corresponding stage file.
also rename the corresponding DVC-file.
E.g.
Having: (hello, hello.dvc)
Expand Down
173 changes: 173 additions & 0 deletions dvc/repo/pkg/install.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
from __future__ import unicode_literals

import os
import shutil
import logging

from dvc.exceptions import DvcException
from dvc.stage import Stage
from dvc.scm.git.temp_git_repo import TempGitRepo


logger = logging.getLogger(__name__)


class PackageManager(object):
    """Resolve package addresses to package handler objects."""

    # NOTE(review): not read anywhere in this class; presumably the name of
    # the project's package manifest -- confirm before relying on it.
    PACKAGE_FILE = "package.yaml"

    def __init__(self, addr):
        """Remember the package address this manager was created for."""
        self._addr = addr

    @staticmethod
    def read_packages():
        """Return the list of known package addresses.

        Currently a stub: it always returns an empty list.
        """
        return []

    @staticmethod
    def get_package(addr):
        """Return a handler instance for ``addr``.

        Each known package class is tried in order and the first one that
        can be instantiated is returned. ``addr`` itself is not inspected.

        Returns:
            A package handler instance, or None when no class could be
            instantiated.
        """
        for pkg_class in [GitPackage]:
            try:
                return pkg_class()
            except Exception as exc:
                # Stay best-effort (fall through to the next candidate), but
                # leave a trace instead of discarding the failure silently.
                logger.debug(
                    "Unable to create package handler '{}': {}".format(
                        pkg_class.__name__, exc
                    )
                )
        return None


class Package(object):
    """Abstract base class for package handlers."""

    # Name of the directory that holds installed package modules.
    MODULES_DIR = "dvc_mod"

    def install_or_update(
        self, parent_repo, address, target_dir, select=None, fname=None
    ):
        """Install or update a package; must be overridden by subclasses.

        Args:
            parent_repo: repository the package is installed into.
            address: address of the package to install.
            target_dir: directory the package outputs are placed in.
            select: optional collection restricting what gets installed;
                None (the default) means no restriction.
            fname: optional name of the DVC-file to generate.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(
            "A method of abstract Package class was called"
        )

    def is_in_root(self):
        """Tell whether the command runs from the repository root.

        The base implementation performs no check and always returns True.
        """
        return True


class GitPackage(Package):
    """Package handler that installs a package from a Git address."""

    # Prefix of the DVC-file name generated when the caller supplies none.
    DEF_DVC_FILE_PREFIX = "mod_"

    def install_or_update(
        self, parent_repo, address, target_dir, select=None, fname=None
    ):
        """Fetch the package at ``address`` and register it in ``parent_repo``.

        Args:
            parent_repo: repository receiving the package; its ``scm.ignore``
                and ``checkout`` are called here.
            address (str): Git address of the package. Its last path
                component is used as the module name.
            target_dir (str): working directory of the generated DVC-file.
            select: optional collection of ``dvc_path`` values; when
                non-empty, only outputs whose ``dvc_path`` is in it are
                installed. None or empty installs every output.
            fname (str): optional explicit path of the DVC-file to write.

        Raises:
            DvcException: when not run from the repository root, when no
                module name can be derived from ``address``, or when the
                DVC-file cannot be written after the package was persisted.
        """
        from git.cmd import Git

        if not self.is_in_root():
            raise DvcException(
                "This command can be run only from a repository root"
            )

        if not os.path.exists(self.MODULES_DIR):
            logger.debug("Creating modules dir {}".format(self.MODULES_DIR))
            os.makedirs(self.MODULES_DIR)
            parent_repo.scm.ignore(os.path.abspath(self.MODULES_DIR))

        # The module is named after the last component of the address.
        module_name = Git.polish_url(address).strip("/").split("/")[-1]
        if not module_name:
            raise DvcException(
                "Package address error: unable to extract package name"
            )

        with TempGitRepo(address, module_name, self.MODULES_DIR) as tmp_repo:
            outputs_to_copy = tmp_repo.outs
            if select:
                outputs_to_copy = [
                    out for out in outputs_to_copy if out.dvc_path in select
                ]

            # Fetch each stage only once even if it owns several outputs.
            fetched_stage_files = {out.stage.path for out in outputs_to_copy}
            tmp_repo.fetch(fetched_stage_files)

            module_dir = self.create_module_dir(module_name)
            tmp_repo.persist_to(module_dir, parent_repo)

            dvc_file = self.get_dvc_file_name(fname, target_dir, module_name)
            try:
                self.persist_stage_and_scm_state(
                    parent_repo, outputs_to_copy, target_dir, dvc_file
                )
            except Exception as ex:
                # The package files are already in place at this point, so
                # report the partial state and keep the original error
                # attached as the cause.
                raise DvcException(
                    "Package '{}' was installed "
                    "but DVC-file '{}' "
                    "was not created properly: {}".format(
                        address, dvc_file, ex
                    ),
                    cause=ex,
                )

        parent_repo.checkout(dvc_file)

    @staticmethod
    def persist_stage_and_scm_state(
        parent_repo, outputs_to_copy, target_dir, dvc_file
    ):
        """Write the DVC-file for the installed outputs and SCM-ignore them.

        A stage is created for ``dvc_file`` with ``target_dir`` as its
        working directory, every output is re-assigned to that stage, each
        output path is added to the SCM ignore list, and the stage is dumped.
        """
        stage = Stage.create(
            repo=parent_repo,
            fname=dvc_file,
            validate_state=False,
            wdir=target_dir,
        )
        stage.outs = [
            out.assign_to_stage_file(stage) for out in outputs_to_copy
        ]

        for out in stage.outs:
            parent_repo.scm.ignore(out.fspath, in_curr_dir=True)

        stage.dump()

    @staticmethod
    def create_module_dir(module_name):
        """Return the module directory path, removing any previous copy.

        An existing directory is deleted (logged as an update); otherwise the
        package is logged as newly added. The directory is not created here.
        """
        module_dir = os.path.join(GitPackage.MODULES_DIR, module_name)
        if os.path.exists(module_dir):
            logger.info("Updating package {}".format(module_name))
            shutil.rmtree(module_dir)
        else:
            logger.info("Adding package {}".format(module_name))
        return module_dir

    def get_dvc_file_name(self, stage_file, target_dir, module_name):
        """Return the path of the DVC-file to write for the package.

        ``stage_file`` wins when given; otherwise the default
        ``<target_dir>/mod_<module_name>.dvc`` is used.
        """
        if stage_file:
            return stage_file

        dvc_file_name = self.DEF_DVC_FILE_PREFIX + module_name + ".dvc"
        return os.path.join(target_dir, dvc_file_name)


def install(self, address, target_dir, select=None, fname=None):
    """
    Install package.
    The command can be run only from DVC project root.
    E.g.
    Having: DVC package in https://github.com/dmpetrov/tag_classifier
    $ dvc pkg install https://github.com/dmpetrov/tag_classifier
    Result: tag_classifier package in dvc_mod/ directory

    Args:
        self: object handed to the package handler as the parent repository
            (presumably the Repo this function is bound to -- confirm).
        address (str): package address; when empty, the addresses returned
            by ``PackageManager.read_packages()`` are installed instead.
        target_dir (str): existing directory, located at or below the
            current directory, that receives the package outputs.
        select: optional collection forwarded to the package handler to
            restrict what is installed; None means no restriction.
        fname (str): optional DVC-file name forwarded to the handler.

    Raises:
        DvcException: if ``target_dir`` does not exist, lies outside the
            current directory, or no handler is available for an address.
    """
    # Normalise here so the handler always receives a list.
    select = [] if select is None else select

    if not os.path.isdir(target_dir):
        raise DvcException(
            "target directory '{}' does not exist".format(target_dir)
        )

    curr_dir = os.path.realpath(os.curdir)
    real_target = os.path.realpath(target_dir)
    # Compare against the current directory *with* a trailing separator:
    # a bare prefix test would accept a sibling such as "/repo2" for "/repo".
    curr_dir_prefix = os.path.join(curr_dir, "")
    if real_target != curr_dir and not real_target.startswith(
        curr_dir_prefix
    ):
        raise DvcException(
            "the target dir '{}' should be a subdirectory of the current "
            "directory".format(target_dir)
        )

    addresses = [address] if address else PackageManager.read_packages()
    for addr in addresses:
        mgr = PackageManager.get_package(addr)
        if mgr is None:
            raise DvcException(
                "unable to find a package handler for '{}'".format(addr)
            )
        # Pass the address being iterated, not the (possibly empty)
        # ``address`` argument.
        mgr.install_or_update(self, addr, target_dir, select, fname)
20 changes: 10 additions & 10 deletions dvc/stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __init__(self, stage):

class StageFileFormatError(DvcException):
def __init__(self, fname, e):
msg = "stage file '{}' format error: {}".format(fname, str(e))
msg = "DVC-file '{}' format error: {}".format(fname, str(e))
super(StageFileFormatError, self).__init__(msg)


Expand Down Expand Up @@ -183,7 +183,7 @@ def relpath(self):

@property
def is_data_source(self):
"""Whether the stage file was created with `dvc add` or `dvc import`"""
"""Whether the DVC-file was created with `dvc add` or `dvc import`"""
return self.cmd is None

@staticmethod
Expand All @@ -206,14 +206,14 @@ def changed_md5(self):
@property
def is_callback(self):
"""
A callback stage is always considered as changed,
An orphan stage is always considered as changed,
so it runs on every `dvc repro` call.
"""
return not self.is_data_source and len(self.deps) == 0

@property
def is_import(self):
"""Whether the stage file was created with `dvc import`."""
"""Whether the DVC-file was created with `dvc import`."""
return not self.cmd and len(self.deps) == 1 and len(self.outs) == 1

@property
Expand All @@ -229,7 +229,7 @@ def _changed_deps(self):

if self.is_callback:
logger.warning(
"DVC-file '{fname}' is a 'callback' stage "
"DVC-file '{fname}' is an 'orphan' stage "
"(has a command and no dependencies) and thus always "
"considered as changed.".format(fname=self.relpath)
)
Expand Down Expand Up @@ -453,9 +453,9 @@ def create(
if wdir is None and cwd is not None:
if fname is not None and os.path.basename(fname) != fname:
raise StageFileBadNameError(
"stage file name '{fname}' may not contain subdirectories"
"DVC-file name '{fname}' may not contain subdirectories"
" if '-c|--cwd' (deprecated) is specified. Use '-w|--wdir'"
" along with '-f' to specify stage file path and working"
" along with '-f' to specify DVC-file path with working"
" directory.".format(fname=fname)
)
wdir = cwd
Expand Down Expand Up @@ -559,7 +559,7 @@ def _fill_stage_outputs(
def _check_dvc_filename(fname):
if not Stage.is_valid_filename(fname):
raise StageFileBadNameError(
"bad stage filename '{}'. Stage files should be named"
"bad DVC-file name '{}'. DVC-files should be named"
" 'Dvcfile' or have a '.dvc' suffix (e.g. '{}.dvc').".format(
relpath(fname), os.path.basename(fname)
)
Expand Down Expand Up @@ -663,10 +663,10 @@ def _compute_md5(self):
if self.PARAM_MD5 in d.keys():
del d[self.PARAM_MD5]

# Ignore the wdir default value. In this case stage file w/o
# Ignore the wdir default value. In this case DVC-file w/o
# wdir has the same md5 as a file with the default value specified.
# It's important for backward compatibility with pipelines that
# didn't have WDIR in their stage files.
# didn't have WDIR in their DVC-files.
if d.get(self.PARAM_WDIR) == ".":
del d[self.PARAM_WDIR]

Expand Down
8 changes: 4 additions & 4 deletions tests/func/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def test_warn_about_large_directories(self):
warning = (
"You are adding a large directory 'large-dir' recursively,"
" consider tracking it as a whole instead.\n"
"{purple}HINT:{nc} Remove the generated stage files and then"
"{purple}HINT:{nc} Remove the generated DVC-files and then"
" run {cyan}dvc add large-dir{nc}".format(
purple=colorama.Fore.MAGENTA,
cyan=colorama.Fore.CYAN,
Expand Down Expand Up @@ -424,7 +424,7 @@ def test(self):

foo_stage = relpath(self.FOO + Stage.STAGE_FILE_SUFFIX)

# corrupt stage file
# corrupt DVC-file
with open(foo_stage, "a+") as file:
file.write("this will break yaml file structure")

Expand All @@ -434,7 +434,7 @@ def test(self):
assert 1 == ret

expected_error = (
"unable to read stage file: {} "
"unable to read DVC-file: {} "
"YAML file structure is corrupted".format(foo_stage)
)

Expand Down Expand Up @@ -475,7 +475,7 @@ def test(self):
self.assertEqual(0, ret)

foo_stage_file = self.FOO + Stage.STAGE_FILE_SUFFIX
# corrupt stage file
# corrupt DVC-file
with open(foo_stage_file, "a+") as file:
file.write("this will break yaml file structure")

Expand Down
2 changes: 1 addition & 1 deletion tests/func/test_repro.py
Original file line number Diff line number Diff line change
Expand Up @@ -1558,7 +1558,7 @@ def test_dvc_formatting_retained(dvc_repo, foo_copy):
root = pathlib.Path(dvc_repo.root_dir)
stage_file = root / foo_copy["stage_fname"]

# Add comments and custom formatting to stage file
# Add comments and custom formatting to DVC-file
lines = list(map(_format_dvc_line, stage_file.read_text().splitlines()))
lines.insert(0, "# Starting comment")
stage_text = "".join(l + "\n" for l in lines)
Expand Down
2 changes: 1 addition & 1 deletion tests/func/test_stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def test_md5_ignores_comments(repo_dir, dvc_repo):
def test_meta_is_preserved(dvc_repo):
stage, = dvc_repo.add("foo")

# Add meta to stage file
# Add meta to DVC-file
data = load_stage_file(stage.path)
data["meta"] = {"custom_key": 42}
dump_stage_file(stage.path, data)
Expand Down

0 comments on commit fba7d85

Please sign in to comment.