Skip to content

Commit

Permalink
tests serializing of stage to lockfile (iterative#3988)
Browse files Browse the repository at this point in the history
* tests: serialize stage for lockfile

* config: make isort/black compatible

* convert class based tests to pytest
  • Loading branch information
skshetry authored Jun 9, 2020
1 parent d201fad commit afc183f
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 35 deletions.
64 changes: 35 additions & 29 deletions dvc/stage/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,51 +63,56 @@ def _serialize_outs(outputs: List[BaseOutput]):
return outs, metrics, plots


def _serialize_params(params: List[ParamsDependency]):
"""Return two types of values from stage:
`keys` - which is list of params without values, used in a pipeline file
which is in the shape of:
['lr', 'train', {'params2.yaml': ['lr']}]
def _serialize_params_keys(params):
"""
Returns the following format of data:
['lr', 'train', {'params2.yaml': ['lr']}]
`key_vals` - which is list of params with values, used in a lockfile
which is in the shape of:
{'params.yaml': {'lr': '1', 'train': 2}, {'params2.yaml': {'lr': '1'}}
The output is sorted, with keys of params from default params file being
at the first, and then followed by entry of other files in lexicographic
order. The keys of those custom files are also sorted in the same order.
"""
keys = []
key_vals = OrderedDict()

for param_dep in sort_by_path(params):
dump = param_dep.dumpd()
path, params = dump[PARAM_PATH], dump[PARAM_PARAMS]
if isinstance(params, dict):
k = sorted(params.keys())
if not k:
continue
key_vals[path] = OrderedDict([(key, params[key]) for key in k])
else:
assert isinstance(params, list)
# no params values available here, entry will be skipped for lock
k = sorted(params)
assert isinstance(params, (dict, list))
# when on no_exec, params are not filled and are saved as list
k = sorted(params.keys() if isinstance(params, dict) else params)
if not k:
continue

# params from default file is always kept at the start of the `params:`
if path == DEFAULT_PARAMS_FILE:
keys = k + keys
if key_vals:
key_vals.move_to_end(path, last=False)
else:
# if it's not a default file, change the shape
# to: {path: k}
keys.append({path: k})
return keys, key_vals
return keys


def _serialize_params_values(params: List[ParamsDependency]):
"""Returns output of following format, used for lockfile:
{'params.yaml': {'lr': '1', 'train': 2}, {'params2.yaml': {'lr': '1'}}
Default params file are always kept at the start, followed by others in
alphabetical order. The param values are sorted too(not recursively though)
"""
key_vals = OrderedDict()
for param_dep in sort_by_path(params):
dump = param_dep.dumpd()
path, params = dump[PARAM_PATH], dump[PARAM_PARAMS]
if isinstance(params, dict):
kv = [(key, params[key]) for key in sorted(params.keys())]
key_vals[path] = OrderedDict(kv)
if path == DEFAULT_PARAMS_FILE:
key_vals.move_to_end(path, last=False)
return key_vals


def to_pipeline_file(stage: "PipelineStage"):
wdir = resolve_wdir(stage.wdir, stage.path)
params, deps = split_params_deps(stage)
deps = sorted([d.def_path for d in deps])
params, _ = _serialize_params(params)
params = _serialize_params_keys(params)

outs, metrics, plots = _serialize_outs(stage.outs)
res = [
Expand Down Expand Up @@ -143,10 +148,11 @@ def to_single_stage_lockfile(stage: "Stage") -> dict:
]
for items in [deps, stage.outs]
]
params = _serialize_params_values(params)
if deps:
res[PARAM_DEPS] = deps
if params:
_, res[PARAM_PARAMS] = _serialize_params(params)
res[PARAM_PARAMS] = params
if outs:
res[PARAM_OUTS] = outs

Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,6 @@ include_trailing_comma=true
known_first_party=dvc,tests
known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,google,grandalf,mock,mockssh,moto,nanotime,networkx,packaging,paramiko,pathspec,pytest,requests,ruamel,setuptools,shortuuid,tqdm,voluptuous,yaml,zc
line_length=79
force_grid_wrap=0
use_parentheses=True
multi_line_output=3
208 changes: 208 additions & 0 deletions tests/unit/stage/test_serialize_pipeline_lock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
from collections import OrderedDict

import pytest
from voluptuous import Schema as _Schema

from dvc.dvcfile import PIPELINE_FILE
from dvc.schema import LOCK_FILE_STAGE_SCHEMA, LOCKFILE_SCHEMA
from dvc.stage import PipelineStage, create_stage
from dvc.stage.serialize import DEFAULT_PARAMS_FILE, to_lockfile
from dvc.stage.serialize import (
to_single_stage_lockfile as _to_single_stage_lockfile,
)
from dvc.stage.utils import split_params_deps

kwargs = {"name": "something", "cmd": "command", "path": PIPELINE_FILE}
Schema = _Schema(LOCK_FILE_STAGE_SCHEMA)


def to_single_stage_lockfile(stage):
"""Validate schema on each serialization."""
e = _to_single_stage_lockfile(stage)
assert Schema(e)
return e


def test_lock(dvc):
stage = create_stage(PipelineStage, dvc, **kwargs)
assert to_single_stage_lockfile(stage) == {"cmd": "command"}


def test_lock_deps(dvc):
stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)
stage.deps[0].info = {"md5": "md-five"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
("deps", [OrderedDict([("path", "input"), ("md5", "md-five")])]),
]
)


def test_lock_deps_order(dvc):
stage = create_stage(
PipelineStage, dvc, deps=["input1", "input0"], **kwargs
)
stage.deps[0].info = {"md5": "md-one1"}
stage.deps[1].info = {"md5": "md-zer0"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
(
"deps",
[
OrderedDict([("path", "input0"), ("md5", "md-zer0")]),
OrderedDict([("path", "input1"), ("md5", "md-one1")]),
],
),
]
)


def test_lock_params(dvc):
stage = create_stage(
PipelineStage, dvc, params=["lorem.ipsum", "abc"], **kwargs
)
stage.deps[0].info = {"lorem.ipsum": {"lorem1": 1, "lorem2": 2}, "abc": 3}
assert to_single_stage_lockfile(stage)["params"][
DEFAULT_PARAMS_FILE
] == OrderedDict([("abc", 3), ("lorem.ipsum", {"lorem1": 1, "lorem2": 2})])


def test_lock_params_file_sorted(dvc):
stage = create_stage(
PipelineStage,
dvc,
params=[
"lorem.ipsum",
"abc",
{"myparams.yaml": ["foo", "foobar"]},
{"a-params-file.yaml": ["bar", "barr"]},
],
**kwargs
)
stage.deps[0].info = {"lorem.ipsum": {"lorem1": 1, "lorem2": 2}, "abc": 3}
stage.deps[1].info = {"foo": ["f", "o", "o"], "foobar": "foobar"}
stage.deps[2].info = {"bar": ["b", "a", "r"], "barr": "barr"}
assert to_single_stage_lockfile(stage)["params"] == OrderedDict(
[
(
DEFAULT_PARAMS_FILE,
OrderedDict(
[("abc", 3), ("lorem.ipsum", {"lorem1": 1, "lorem2": 2})]
),
),
(
"a-params-file.yaml",
OrderedDict([("bar", ["b", "a", "r"]), ("barr", "barr")]),
),
(
"myparams.yaml",
OrderedDict([("foo", ["f", "o", "o"]), ("foobar", "foobar")]),
),
]
)


def test_lock_params_no_values_filled(dvc):
stage = create_stage(
PipelineStage, dvc, params=["lorem.ipsum", "abc"], **kwargs
)
assert to_single_stage_lockfile(stage) == {"cmd": "command"}


@pytest.mark.parametrize("typ", ["plots", "metrics", "outs"])
def test_lock_outs(dvc, typ):
stage = create_stage(PipelineStage, dvc, **{typ: ["input"]}, **kwargs)
stage.outs[0].info = {"md5": "md-five"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
("outs", [OrderedDict([("path", "input"), ("md5", "md-five")])]),
]
)


@pytest.mark.parametrize("typ", ["plots", "metrics", "outs"])
def test_lock_outs_order(dvc, typ):
stage = create_stage(
PipelineStage, dvc, **{typ: ["input1", "input0"]}, **kwargs
)
stage.outs[0].info = {"md5": "md-one1"}
stage.outs[1].info = {"md5": "md-zer0"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
(
"outs",
[
OrderedDict([("path", "input0"), ("md5", "md-zer0")]),
OrderedDict([("path", "input1"), ("md5", "md-one1")]),
],
),
]
)


def test_dump_appropriate_checksums(dvc):
stage = create_stage(
PipelineStage, dvc, deps=["s3://dvc-temp/file"], **kwargs
)
stage.deps[0].info = {"etag": "is-it-etag", "md5": "or-md5?"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
(
"deps",
[
OrderedDict(
[
("path", "s3://dvc-temp/file"),
("etag", "is-it-etag"),
]
)
],
),
]
)


def test_order(dvc):
stage = create_stage(
PipelineStage,
dvc,
deps=["input"],
outs=["output"],
params=["foo-param"],
**kwargs
)
params, deps = split_params_deps(stage)

deps[0].info = {"md5": "md-five"}
params[0].info = {"foo-param": "value"}
stage.outs[0].info = {"md5": "md5-output"}

assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
("deps", [{"path": "input", "md5": "md-five"}]),
("params", {"params.yaml": {"foo-param": "value"}}),
("outs", [{"path": "output", "md5": "md5-output"}]),
]
)


def test_to_lockfile(dvc):
stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)
stage.deps[0].info = {"md5": "md-five"}
entry = to_lockfile(stage)
assert len(entry) == 1
_Schema(LOCKFILE_SCHEMA)(entry)
assert entry == {
"something": OrderedDict(
[
("cmd", "command"),
("deps", [{"path": "input", "md5": "md-five"}]),
]
)
}
10 changes: 4 additions & 6 deletions tests/unit/stage/test_stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import signal
import subprocess
import threading
from unittest import TestCase

import mock
import pytest
Expand Down Expand Up @@ -50,12 +49,11 @@ def test_meta_ignored():
assert stage.compute_md5() == "e9521a22111493406ea64a88cda63e0b"


class TestPathConversion(TestCase):
def test(self):
stage = Stage(None, "path")
def test_path_conversion(dvc):
stage = Stage(dvc, "path")

stage.wdir = os.path.join("..", "..")
self.assertEqual(stage.dumpd()["wdir"], "../..")
stage.wdir = os.path.join("..", "..")
assert stage.dumpd()["wdir"] == "../.."


def test_stage_update(mocker):
Expand Down

0 comments on commit afc183f

Please sign in to comment.