Skip to content

Commit

Permalink
[WIP] Pkg install (iterative#1742)
Browse files Browse the repository at this point in the history
* Download Git package

* Extract tmp git repo & fetch data

* Handle fetch\cloud errors and rename select option

* Modify gitignore files when deploy a package

* More tests for git ignore file. Fix an old issue.

* Raise an exception when .gitignore is not in a file subdir

* Existing tests fixes

* Basic test for packages

* Comment out pkg tests

* Codeclimate improvements

* Reformatting

* Enable pkg tests

* Fix tests on Windows

* Another old bug fix

* Fix encoding issue on Windows

* Another windows fix: apply compat.str magic to file write

* Back to posix path in gitignore and apply compat.str magic to tests

* gitignorepath: OS-specific path to posix

* Forgotten assignment

* misc: cleanup small issues

Signed-off-by: Ruslan Kuprieiev <[email protected]>

* cli: pkg: hide from help

Signed-off-by: Ruslan Kuprieiev <[email protected]>
  • Loading branch information
dmpetrov authored and efiop committed Mar 20, 2019
1 parent a691985 commit 70bc101
Show file tree
Hide file tree
Showing 15 changed files with 739 additions and 46 deletions.
2 changes: 2 additions & 0 deletions dvc/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import dvc.logger as logger
from dvc.command.base import fix_subparsers
import dvc.command.init as init
import dvc.command.pkg as pkg
import dvc.command.destroy as destroy
import dvc.command.remove as remove
import dvc.command.move as move
Expand Down Expand Up @@ -36,6 +37,7 @@

COMMANDS = [
init,
pkg,
destroy,
add,
remove,
Expand Down
81 changes: 81 additions & 0 deletions dvc/command/pkg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from __future__ import unicode_literals

import dvc.logger as logger
from dvc.exceptions import DvcException
from dvc.command.base import CmdBase, fix_subparsers
from dvc.repo.pkg import PackageParams


class CmdPkg(CmdBase):
    """CLI command object behind ``dvc pkg install``."""

    def run(self, unlock=False):
        """Install the package described by the parsed CLI arguments.

        ``unlock`` is not used by this method; it is kept so the signature
        stays unchanged for callers.

        Returns:
            The value returned by ``self.repo.install_pkg`` or ``1`` when a
            ``DvcException`` is raised while installing.
        """
        try:
            pkg_params = PackageParams(
                self.args.address,
                self.args.target_dir,
                self.args.select,
                self.args.file,
            )
            return self.repo.install_pkg(pkg_params)
        except DvcException:
            logger.error(
                "failed to install package '{}'".format(self.args.address)
            )
            return 1


def add_parser(subparsers, parent_parser):
    """Register the ``pkg`` command and its ``install`` sub-command.

    Args:
        subparsers: sub-parser collection of the top-level CLI parser.
        parent_parser: parser whose options are inherited by both the
            ``pkg`` parser and the ``pkg install`` parser.
    """
    from dvc.command.config import parent_config_parser

    pkg_description = "Manage packages and modules"
    install_description = "Install package."

    pkg_root = subparsers.add_parser(
        "pkg",
        parents=[parent_parser],
        description=pkg_description,
        add_help=False,
    )
    pkg_commands = pkg_root.add_subparsers(
        dest="cmd", help="Use dvc pkg CMD --help for command-specific help."
    )
    fix_subparsers(pkg_commands)

    install = pkg_commands.add_parser(
        "install",
        parents=[parent_config_parser, parent_parser],
        description=install_description,
        help=install_description,
    )

    # (flags, options) pairs, registered in order: the two positionals must
    # stay in this sequence because argparse fills them left to right.
    install_arguments = (
        (
            ("address",),
            dict(
                nargs="?",
                default="",
                help="Package address: git://<url> or https://github.com/...",
            ),
        ),
        (
            ("target_dir",),
            dict(
                metavar="target",
                nargs="?",
                default=".",
                help="Target directory to deploy package outputs. "
                "Default value is the current dir.",
            ),
        ),
        (
            ("-s", "--select"),
            dict(
                metavar="OUT",
                action="append",
                default=[],
                help="Select and persist only specified outputs from a "
                "package. "
                "The parameter can be used multiple times. "
                "All outputs will be selected by default.",
            ),
        ),
        (
            ("-f", "--file"),
            dict(
                help="Specify name of the stage file. It should be "
                "either 'Dvcfile' or have a '.dvc' suffix (e.g. "
                "'prepare.dvc', 'clean.dvc', etc). "
                "By default the file has 'mod_' prefix and imported package "
                "name "
                "followed by .dvc",
            ),
        ),
    )
    for flags, options in install_arguments:
        install.add_argument(*flags, **options)

    install.set_defaults(func=CmdPkg)
5 changes: 5 additions & 0 deletions dvc/output/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ def is_local(self):
"is local is not supported for {}".format(self.scheme)
)

def assign_to_stage_file(self, target_repo):
    """Base implementation: always raise ``DvcException``.

    The message names this output's ``scheme``.
    """
    message = "change repo is not supported for {}".format(self.scheme)
    raise DvcException(message)

@classmethod
def match(cls, url):
    """Return the result of matching ``url`` against ``cls.REMOTE.REGEX``.

    The value is whatever ``re.match`` returns: a match object, or ``None``
    when the URL does not match.
    """
    pattern = cls.REMOTE.REGEX
    return re.match(pattern, url)
Expand Down
15 changes: 15 additions & 0 deletions dvc/output/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,17 @@ def is_local(self):
self.url
)

def assign_to_stage_file(self, stage):
    """Re-home this output onto ``stage`` and return ``self``.

    The output keeps its path relative to the stage working directory, but
    that relative path is re-anchored at ``stage.wdir``.
    """
    from dvc.repo import Repo

    new_wdir = os.path.abspath(stage.wdir)
    # ``stage_path`` is read before ``self.stage`` is replaced below, so it
    # is still relative to the stage this output currently belongs to.
    relative = self.stage_path
    self.path_info["path"] = os.path.join(new_wdir, relative)

    self.repo = Repo(self.path)
    self.stage = stage

    return self

@property
def sep(self):
    """Path separator used by this output: the host OS separator."""
    return os.path.sep
Expand All @@ -50,6 +61,10 @@ def sep(self):
def rel_path(self):
return os.path.relpath(self.path)

@property
def stage_path(self):
    """This output's path relative to its stage's working directory."""
    location = self.path
    return os.path.relpath(location, start=self.stage.wdir)

@property
def cache(self):
    """Look up this output's checksum in the repo's local cache."""
    local_cache = self.repo.cache.local
    return local_cache.get(self.checksum)
Expand Down
1 change: 1 addition & 0 deletions dvc/repo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class Repo(object):
from dvc.repo.status import status
from dvc.repo.gc import gc
from dvc.repo.commit import commit
from dvc.repo.pkg import install_pkg

def __init__(self, root_dir=None):
from dvc.config import Config
Expand Down
201 changes: 201 additions & 0 deletions dvc/repo/pkg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
from __future__ import unicode_literals

import os
import shutil
from git.cmd import Git

import dvc.logger as logger
from dvc.exceptions import DvcException
from dvc.stage import Stage
from dvc.temp_git_repo import TempGitRepo


class PackageParams(object):
    """Arguments of one package installation request.

    Attributes are stored as given:
        address: package address; may be empty.
        target_dir: directory the package outputs are deployed to.
        select: list of output names to install; empty when not restricted.
        file: explicit stage file name, or ``None``.
    """

    def __init__(self, address, target_dir, select=None, file=None):
        self.address = address
        self.target_dir = target_dir
        # A new list per instance: a literal ``[]`` default would be shared
        # by every instance created without an explicit ``select``.
        self.select = [] if select is None else select
        self.file = file

    @property
    def all_addresses(self):
        """Addresses to install.

        Returns the explicit address as a one-element list when it is set;
        otherwise whatever ``PackageManager.read_packages()`` returns.
        """
        if self.address:
            return [self.address]
        return PackageManager.read_packages()


class PackageManager(object):
    """Entry point for looking up package handlers."""

    PACKAGE_FILE = "package.yaml"

    def __init__(self, addr):
        self._addr = addr

    @staticmethod
    def read_packages():
        """Return the list of known package addresses (currently empty)."""
        return []

    @staticmethod
    def get_package(addr):
        """Return an instance of the first package class that can be created.

        Returns ``None`` when every candidate class fails to instantiate.
        NOTE(review): ``addr`` is not passed to the package class; it is only
        used in the debug message below.
        """
        for pkg_class in [GitPackage]:
            try:
                return pkg_class()
            except Exception as exc:
                # Still best-effort, but leave a trace of why this candidate
                # was rejected instead of discarding the error silently.
                logger.debug(
                    "Unable to create {} for '{}': {}".format(
                        pkg_class.__name__, addr, exc
                    )
                )
        return None


class Package(object):
    """Base class for installable packages."""

    MODULES_DIR = "dvc_mod"

    def is_in_root(self):
        """Report whether the command runs from the repository root.

        This implementation unconditionally answers ``True``.
        """
        return True

    def install_or_update(self, parent_repo, pkg_param):
        """Install or update the package; must be provided by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        message = "A method of abstract Package class was called"
        raise NotImplementedError(message)


# Package implementation that installs from a Git address: the address is
# opened through TempGitRepo, the chosen outputs are fetched and persisted
# under MODULES_DIR/<module name>, and a stage file describing them is
# written into the parent repo.
class GitPackage(Package):
# Prefix of the stage file name generated when none is given explicitly
# (see get_dvc_file_name).
DEF_DVC_FILE_PREFIX = "mod_"

# Install, or update if already present, the package at pkg_params.address
# into parent_repo.
# Raises DvcException when not run from the repository root, when no module
# name can be derived from the address, or when writing the stage file fails.
def install_or_update(self, parent_repo, pkg_params):
if not self.is_in_root():
raise DvcException(
"This command can be run only from a repository root"
)

# MODULES_DIR is a relative path, so it is resolved against the current
# working directory. The modules dir is registered with the SCM ignore list.
if not os.path.exists(self.MODULES_DIR):
logger.debug("Creating modules dir {}".format(self.MODULES_DIR))
os.makedirs(self.MODULES_DIR)
parent_repo.scm.ignore(os.path.abspath(self.MODULES_DIR))

# The module name is the last "/"-separated component of the polished
# address, ignoring leading/trailing slashes.
module_name = (
Git.polish_url(pkg_params.address).strip("/").split("/")[-1]
)
if not module_name:
raise DvcException(
"Package address error: unable to extract package name"
)

with TempGitRepo(
pkg_params.address, module_name, Package.MODULES_DIR
) as tmp_repo:
outputs_to_copy = tmp_repo.outs
# When a selection was given, keep only outputs whose dvc_path is listed.
if pkg_params.select:
outputs_to_copy = list(
filter(
lambda out: out.dvc_path in pkg_params.select,
outputs_to_copy,
)
)

# Fetch only the stage files that own at least one output being copied.
fetched_stage_files = set(
map(lambda o: o.stage.path, outputs_to_copy)
)
tmp_repo.fetch(fetched_stage_files)

module_dir = self.create_module_dir(module_name)
tmp_repo.persist_to(module_dir, parent_repo)

dvc_file = self.get_dvc_file_name(
pkg_params.file, pkg_params.target_dir, module_name
)
# persist_to has already run at this point, so any failure while writing the
# stage file is reported as "installed but stage file not created properly".
try:
self.persist_stage_and_scm_state(
parent_repo,
outputs_to_copy,
pkg_params.target_dir,
dvc_file,
)
except Exception as ex:
raise DvcException(
"Package '{}' was installed "
"but stage file '{}' "
"was not created properly: {}".format(
pkg_params.address, dvc_file, ex
)
)

parent_repo.checkout(dvc_file)

# Create a stage named dvc_file with wdir=target_dir in parent_repo, attach
# every output to it via assign_to_stage_file, add each output path to the
# SCM ignore list, and dump the stage file.
@staticmethod
def persist_stage_and_scm_state(
parent_repo, outputs_to_copy, target_dir, dvc_file
):
stage = Stage.create(
repo=parent_repo,
fname=dvc_file,
validate_state=False,
wdir=target_dir,
)
stage.outs = list(
map(lambda o: o.assign_to_stage_file(stage), outputs_to_copy)
)

for out in stage.outs:
parent_repo.scm.ignore(out.path, in_curr_dir=True)

stage.dump()

# Return the path MODULES_DIR/<module_name>. A directory already at that
# path is deleted first (the "update" case); this method never creates the
# directory itself.
@staticmethod
def create_module_dir(module_name):
module_dir = os.path.join(GitPackage.MODULES_DIR, module_name)
if os.path.exists(module_dir):
logger.info("Updating package {}".format(module_name))
shutil.rmtree(module_dir)
else:
logger.info("Adding package {}".format(module_name))
return module_dir

# Return the stage file path: an explicit stage_file is used as given;
# otherwise "<DEF_DVC_FILE_PREFIX><module_name>.dvc" inside target_dir.
def get_dvc_file_name(self, stage_file, target_dir, module_name):
if stage_file:
dvc_file_path = stage_file
else:
dvc_file_name = self.DEF_DVC_FILE_PREFIX + module_name + ".dvc"
dvc_file_path = os.path.join(target_dir, dvc_file_name)
return dvc_file_path


def install_pkg(self, pkg_params):
    """
    Install package.
    The command can be run only from DVC project root.
    E.g.
    Having: DVC package in https://github.com/dmpetrov/tag_classifier
    $ dvc pkg install https://github.com/dmpetrov/tag_classifier
    Result: tag_classifier package in dvc_mod/ directory

    Args:
        pkg_params: object providing ``target_dir`` and ``all_addresses``;
            it is also handed to each package's ``install_or_update``.

    Returns:
        0 when every address was installed, 1 after the first failure
        (the reason is logged as an error).
    """

    if not os.path.isdir(pkg_params.target_dir):
        logger.error(
            "Unable to install package: "
            "target directory '{}' does not exist".format(
                pkg_params.target_dir
            )
        )
        return 1

    curr_dir = os.path.realpath(os.curdir)
    target_dir = os.path.realpath(pkg_params.target_dir)
    # Compare whole path components. A bare ``startswith(curr_dir)`` would
    # also accept a sibling directory such as "<curr_dir>-backup".
    # ``os.path.join(curr_dir, "")`` appends a separator only when missing.
    curr_prefix = os.path.join(curr_dir, "")
    if target_dir != curr_dir and not target_dir.startswith(curr_prefix):
        logger.error(
            "Unable to install package: the target dir '{}' should be"
            " inside the current dir".format(pkg_params.target_dir)
        )
        return 1

    for addr in pkg_params.all_addresses:
        try:
            mgr = PackageManager.get_package(addr)
            if mgr is None:
                # Without this check the failure surfaces as an opaque
                # AttributeError on ``None``.
                raise DvcException(
                    "no package handler available for '{}'".format(addr)
                )
            mgr.install_or_update(self, pkg_params)
        except Exception as ex:
            # The "{}" placeholder was missing, so the reason was never shown.
            logger.error("Unable to install package: {}".format(ex))
            return 1

    return 0
7 changes: 6 additions & 1 deletion dvc/scm/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,17 @@ class FileNotInRepoError(DvcException):
"""


class FileNotInTargetSubdirError(DvcException):
"""Thrown when trying to place a .gitignore for a file that is not in
the file subdirectory."""


class Base(object):
"""Base class for source control management driver implementations."""

def __init__(self, root_dir=os.curdir, repo=None):
    """Remember the owning repo and the SCM root directory.

    Args:
        root_dir: root directory of the working tree; defaults to the
            current directory.
        repo: owning repo object, stored as given (may be ``None``).
    """
    self.repo = repo
    # Stored in canonical form (``os.path.realpath``). The plain
    # ``self.root_dir = root_dir`` assignment that preceded this line was a
    # dead store and has been dropped.
    self.root_dir = os.path.realpath(root_dir)

def __repr__(self):
return "{class_name}: '{directory}'".format(
Expand Down
Loading

0 comments on commit 70bc101

Please sign in to comment.