Skip to content

Commit

Permalink
dvc check-ignore command (iterative#4282)
Browse files Browse the repository at this point in the history
* Update some tests first

fix iterative#3736

* first edition

* solve failure in Windows

* For Windows ci

* Some help ducuments issue.

* Update dvc/ignore.py

abspath

Co-authored-by: Ruslan Kuprieiev <[email protected]>

* Refactor with OutOfWorkSpaceError

* Solve a bug

* Update dvc/command/check_ignore.py

Co-authored-by: Jorge Orpinel <[email protected]>

* Update dvc/command/check_ignore.py

Co-authored-by: Jorge Orpinel <[email protected]>

* Update dvc/command/check_ignore.py

* Revert "Refactor with OutOfWorkSpaceError"

This reverts commit 27eec49.

* Two change request

1. Argument `targets`'s description.
2. Error handling of `_get_normalize_path`

* Update dvc/main.py

Co-authored-by: Ruslan Kuprieiev <[email protected]>

* `check_ignore` now only accept one path a time

1. Add a new test for the out side repo cases
2. check ignore now only check one file not file lists

Co-authored-by: Ruslan Kuprieiev <[email protected]>
Co-authored-by: karajan1001 <[email protected]>
Co-authored-by: Jorge Orpinel <[email protected]>
  • Loading branch information
4 people authored Aug 3, 2020
1 parent b823ce4 commit 11d906e
Show file tree
Hide file tree
Showing 7 changed files with 293 additions and 24 deletions.
2 changes: 2 additions & 0 deletions dvc/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .command import (
add,
cache,
check_ignore,
checkout,
commit,
completion,
Expand Down Expand Up @@ -79,6 +80,7 @@
git_hook,
plots,
experiments,
check_ignore,
]


Expand Down
70 changes: 70 additions & 0 deletions dvc/command/check_ignore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import argparse
import logging

from dvc.command import completion
from dvc.command.base import CmdBase, append_doc_link
from dvc.exceptions import DvcException

logger = logging.getLogger(__name__)


class CmdCheckIgnore(CmdBase):
def __init__(self, args):
super().__init__(args)
self.ignore_filter = self.repo.tree.dvcignore

def _show_results(self, result):
if result.match or self.args.non_matching:
if self.args.details:
logger.info("{}\t{}".format(result.patterns[-1], result.file))
else:
logger.info(result.file)

def run(self):
if self.args.non_matching and not self.args.details:
raise DvcException("--non-matching is only valid with --details")

if self.args.quiet and self.args.details:
raise DvcException("cannot both --details and --quiet")

ret = 1
for target in self.args.targets:
result = self.ignore_filter.check_ignore(target)
self._show_results(result)
if result.match:
ret = 0
return ret


def add_parser(subparsers, parent_parser):
ADD_HELP = "Debug DVC ignore/exclude files"

parser = subparsers.add_parser(
"check-ignore",
parents=[parent_parser],
description=append_doc_link(ADD_HELP, "check-ignore"),
help=ADD_HELP,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"-d",
"--details",
action="store_true",
default=False,
help="Show the exclude pattern together with each target path.",
)
parser.add_argument(
"-n",
"--non-matching",
action="store_true",
default=False,
help="Show the target paths which don’t match any pattern. "
"Only usable when `--details` is also employed",
)
parser.add_argument(
"targets",
nargs="+",
help="Exact or wildcard paths of files or directories to check "
"ignore patterns.",
).complete = completion.FILE
parser.set_defaults(func=CmdCheckIgnore)
97 changes: 83 additions & 14 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import logging
import os
import re
from collections import namedtuple
from itertools import groupby, takewhile

from pathspec.patterns import GitWildMatchPattern
from pathspec.util import normalize_file
from pygtrie import StringTrie

from dvc.path_info import PathInfo
from dvc.pathspec_math import merge_patterns
from dvc.pathspec_math import PatternInfo, merge_patterns
from dvc.system import System
from dvc.utils import relpath

Expand All @@ -24,28 +25,46 @@ def __call__(self, root, dirs, files):

class DvcIgnorePatterns(DvcIgnore):
def __init__(self, pattern_list, dirname):
if pattern_list:
if isinstance(pattern_list[0], str):
pattern_list = [
PatternInfo(pattern, "") for pattern in pattern_list
]

self.pattern_list = pattern_list
self.dirname = dirname
self.prefix = self.dirname + os.sep

regex_pattern_list = map(
GitWildMatchPattern.pattern_to_regex, pattern_list
)
self.regex_pattern_list = [
GitWildMatchPattern.pattern_to_regex(pattern_info.patterns)
for pattern_info in pattern_list
]

self.ignore_spec = [
(ignore, re.compile("|".join(item[0] for item in group)))
for ignore, group in groupby(regex_pattern_list, lambda x: x[1])
for ignore, group in groupby(
self.regex_pattern_list, lambda x: x[1]
)
if ignore is not None
]

@classmethod
def from_files(cls, ignore_file_path, tree):
assert os.path.isabs(ignore_file_path)
dirname = os.path.normpath(os.path.dirname(ignore_file_path))
ignore_file_rel_path = os.path.relpath(
ignore_file_path, tree.tree_root
)
with tree.open(ignore_file_path, encoding="utf-8") as fobj:
path_spec_lines = [
line for line in map(str.strip, fobj.readlines()) if line
PatternInfo(
line,
"{}:{}:{}".format(ignore_file_rel_path, line_no + 1, line),
)
for line_no, line in enumerate(
map(str.strip, fobj.readlines())
)
if line
]

return cls(path_spec_lines, dirname)
Expand All @@ -56,7 +75,7 @@ def __call__(self, root, dirs, files):

return dirs, files

def matches(self, dirname, basename, is_dir=False):
def _get_normalize_path(self, dirname, basename):
# NOTE: `relpath` is too slow, so we have to assume that both
# `dirname` and `self.dirname` are relative or absolute together.
if dirname == self.dirname:
Expand All @@ -70,6 +89,12 @@ def matches(self, dirname, basename, is_dir=False):

if not System.is_unix():
path = normalize_file(path)
return path

def matches(self, dirname, basename, is_dir=False):
path = self._get_normalize_path(dirname, basename)
if not path:
return False
return self.ignore(path, is_dir)

def ignore(self, path, is_dir):
Expand All @@ -85,20 +110,48 @@ def ignore(self, path, is_dir):
result = ignore
return result

def match_details(self, dirname, basename, is_dir=False):
path = self._get_normalize_path(dirname, basename)
if not path:
return False
return self._ignore_details(path, is_dir)

def _ignore_details(self, path, is_dir):
result = []
for ignore, pattern in zip(self.regex_pattern_list, self.pattern_list):
regex = re.compile(ignore[0])
# skip system pattern
if not pattern.file_info:
continue
if is_dir:
path_dir = f"{path}/"
if regex.match(path) or regex.match(path_dir):
result.append(pattern.file_info)
else:
if regex.match(path):
result.append(pattern.file_info)
return result

def __hash__(self):
return hash(self.dirname + ":" + "\n".join(self.pattern_list))
return hash(self.dirname + ":" + str(self.pattern_list))

def __eq__(self, other):
if not isinstance(other, DvcIgnorePatterns):
return NotImplemented
return (self.dirname == other.dirname) & (
self.pattern_list == other.pattern_list
[pattern.patterns for pattern in self.pattern_list]
== [pattern.patterns for pattern in other.pattern_list]
)

def __bool__(self):
return bool(self.pattern_list)


CheckIgnoreResult = namedtuple(
"CheckIgnoreResult", ["file", "match", "patterns"]
)


class DvcIgnoreFilterNoop:
def __init__(self, tree, root_dir):
pass
Expand All @@ -112,6 +165,9 @@ def is_ignored_dir(self, _):
def is_ignored_file(self, _):
return False

def check_ignore(self, _):
return []


class DvcIgnoreFilter:
@staticmethod
Expand Down Expand Up @@ -166,20 +222,19 @@ def _update(self, dirname):
def _update_sub_repo(self, root, dirs):
for d in dirs:
if self._is_dvc_repo(root, d):
new_pattern = DvcIgnorePatterns(["/{}/".format(d)], root)
old_pattern = self.ignores_trie_tree.longest_prefix(root).value
if old_pattern:
self.ignores_trie_tree[root] = DvcIgnorePatterns(
*merge_patterns(
old_pattern.pattern_list,
old_pattern.dirname,
["/{}/".format(d)],
root,
new_pattern.pattern_list,
new_pattern.dirname,
)
)
else:
self.ignores_trie_tree[root] = DvcIgnorePatterns(
["/{}/".format(d)], root
)
self.ignores_trie_tree[root] = new_pattern

def __call__(self, root, dirs, files):
ignore_pattern = self._get_trie_pattern(root)
Expand Down Expand Up @@ -245,3 +300,17 @@ def _outside_repo(self, path):
):
return True
return False

def check_ignore(self, target):
full_target = os.path.abspath(target)
if not self._outside_repo(full_target):
dirname, basename = os.path.split(os.path.normpath(full_target))
pattern = self._get_trie_pattern(dirname)
if pattern:
matches = pattern.match_details(
dirname, basename, os.path.isdir(full_target)
)

if matches:
return CheckIgnoreResult(target, True, matches)
return CheckIgnoreResult(target, False, ["::"])
8 changes: 7 additions & 1 deletion dvc/pathspec_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
# of two path specification patterns with different base
# All the operations follow the documents of `gitignore`
import os
from collections import namedtuple

from pathspec.util import normalize_file

from dvc.utils import relpath

PatternInfo = namedtuple("PatternInfo", ["patterns", "file_info"])


def _not_ignore(rule):
return (True, rule[1:]) if rule.startswith("!") else (False, rule)
Expand Down Expand Up @@ -59,7 +62,10 @@ def _change_dirname(dirname, pattern_list, new_dirname):
if rel.startswith(".."):
raise ValueError("change dirname can only change to parent path")

return [change_rule(rule, rel) for rule in pattern_list]
return [
PatternInfo(change_rule(rule.patterns, rel), rule.file_info)
for rule in pattern_list
]


def merge_patterns(pattern_a, prefix_a, pattern_b, prefix_b):
Expand Down
Loading

0 comments on commit 11d906e

Please sign in to comment.