Skip to content

Commit

Permalink
🎉 Add --word-list option
Browse files Browse the repository at this point in the history
- Add `pyahocorasick` as an optional dependency

See issue Yelp#240 for more information.
  • Loading branch information
KevinHock committed Sep 19, 2019
1 parent 0b2c0e1 commit f8cb31f
Show file tree
Hide file tree
Showing 22 changed files with 425 additions and 76 deletions.
14 changes: 13 additions & 1 deletion detect_secrets/core/baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,27 @@ def initialize(
plugins,
exclude_files_regex=None,
exclude_lines_regex=None,
word_list_file=None,
word_list_hash=None,
should_scan_all_files=False,
):
"""Scans the entire codebase for secrets, and returns a
SecretsCollection object.
:type path: list
:type plugins: tuple of detect_secrets.plugins.base.BasePlugin
:param plugins: rules to initialize the SecretsCollection with.
:type exclude_files_regex: str|None
:type exclude_lines_regex: str|None
:type path: list
:type word_list_file: str|None
:param word_list_file: optional word list file for ignoring certain words.
:type word_list_hash: str|None
:param word_list_hash: optional iterated sha1 hash of the words in the word list.
:type should_scan_all_files: bool
:rtype: SecretsCollection
Expand All @@ -37,6 +47,8 @@ def initialize(
plugins,
exclude_files=exclude_files_regex,
exclude_lines=exclude_lines_regex,
word_list_file=word_list_file,
word_list_hash=word_list_hash,
)

files_to_scan = []
Expand Down
29 changes: 26 additions & 3 deletions detect_secrets/core/secrets_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from detect_secrets.core.log import log
from detect_secrets.core.potential_secret import PotentialSecret
from detect_secrets.plugins.common import initialize
from detect_secrets.util import build_automaton


class SecretsCollection(object):
Expand All @@ -21,6 +22,8 @@ def __init__(
plugins=(),
exclude_files=None,
exclude_lines=None,
word_list_file=None,
word_list_hash=None,
):
"""
:type plugins: tuple of detect_secrets.plugins.base.BasePlugin
Expand All @@ -32,14 +35,18 @@ def __init__(
:type exclude_lines: str|None
:param exclude_lines: optional regex for ignored lines.
:type version: str
:param version: version of detect-secrets that SecretsCollection
is valid at.
:type word_list_file: str|None
:param word_list_file: optional word list file for ignoring certain words.
:type word_list_hash: str|None
:param word_list_hash: optional iterated sha1 hash of the words in the word list.
"""
self.data = {}
self.plugins = plugins
self.exclude_files = exclude_files
self.exclude_lines = exclude_lines
self.word_list_file = word_list_file
self.word_list_hash = word_list_hash
self.version = VERSION

@classmethod
Expand Down Expand Up @@ -93,13 +100,25 @@ def load_baseline_from_dict(cls, data):
result.exclude_files = data['exclude']['files']
result.exclude_lines = data['exclude']['lines']

# In v0.12.7 the `--word-list` option got added
automaton = None
if 'word_list' in data:
result.word_list_file = data['word_list']['file']
result.word_list_hash = data['word_list']['hash']

if result.word_list_file:
# Always ignore the given `data['word_list']['hash']`
# The difference will show whenever the word list changes
automaton, result.word_list_hash = build_automaton(result.word_list_file)

plugins = []
for plugin in data['plugins_used']:
plugin_classname = plugin.pop('name')
plugins.append(
initialize.from_plugin_classname(
plugin_classname,
exclude_lines_regex=result.exclude_lines,
automaton=automaton,
should_verify_secrets=False,
**plugin
),
Expand Down Expand Up @@ -277,6 +296,10 @@ def format_for_baseline_output(self):
'files': self.exclude_files,
'lines': self.exclude_lines,
},
'word_list': {
'file': self.word_list_file,
'hash': self.word_list_hash,
},
'plugins_used': plugins_used,
'results': results,
'version': self.version,
Expand Down
22 changes: 20 additions & 2 deletions detect_secrets/core/usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,18 @@ def add_exclude_lines_argument(parser):
)


def add_word_list_argument(parser):
    """Register the ``--word-list`` command line option on ``parser``.

    The value supplied by the user is stored on the parsed namespace
    under ``word_list_file``.

    :type parser: argparse.ArgumentParser
    :param parser: parser to attach the option to.
    """
    help_text = (
        'Text file with a list of words, '
        'if a secret contains a word in the list we ignore it.'
    )
    parser.add_argument(
        '--word-list',
        dest='word_list_file',
        type=str,
        help=help_text,
    )


def add_use_all_plugins_argument(parser):
parser.add_argument(
'--use-all-plugins',
Expand Down Expand Up @@ -46,6 +58,7 @@ def add_pre_commit_arguments(self):
self._add_filenames_argument()\
._add_set_baseline_argument()\
._add_exclude_lines_argument()\
._add_word_list_argument()\
._add_use_all_plugins_argument()\
._add_no_verify_flag()

Expand Down Expand Up @@ -108,6 +121,10 @@ def _add_exclude_lines_argument(self):
add_exclude_lines_argument(self.parser)
return self

def _add_word_list_argument(self):
    """Attach the ``--word-list`` option to this builder's parser.

    :returns: this same object, so that calls can be chained.
    """
    add_word_list_argument(self.parser)
    return self

def _add_use_all_plugins_argument(self):
    """Attach the ``--use-all-plugins`` option to this builder's parser.

    :returns: this same object, so that calls can be chained.
    """
    add_use_all_plugins_argument(self.parser)
    return self
Expand Down Expand Up @@ -143,9 +160,10 @@ def _add_initialize_baseline_argument(self):
),
)

# Pairing `--exclude-lines` to both pre-commit and `--scan`
# because it can be used for both.
# Pairing `--exclude-lines` and `--word-list` to
# both pre-commit and `--scan` because they can be used for both.
add_exclude_lines_argument(self.parser)
add_word_list_argument(self.parser)

# Pairing `--exclude-files` with `--scan` because it's only used for the initialization.
# The pre-commit hook framework already has an `exclude` option that can
Expand Down
38 changes: 33 additions & 5 deletions detect_secrets/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from detect_secrets.core.secrets_collection import SecretsCollection
from detect_secrets.core.usage import ParserBuilder
from detect_secrets.plugins.common import initialize
from detect_secrets.util import build_automaton


def parse_args(argv):
Expand All @@ -30,11 +31,17 @@ def main(argv=None):
log.set_debug_level(args.verbose)

if args.action == 'scan':
automaton = None
word_list_hash = None
if args.word_list_file:
automaton, word_list_hash = build_automaton(args.word_list_file)

# Plugins are *always* rescanned with fresh settings, because
# we want to get the latest updates.
plugins = initialize.from_parser_builder(
args.plugins,
exclude_lines_regex=args.exclude_lines,
automaton=automaton,
should_verify_secrets=not args.no_verify,
)
if args.string:
Expand All @@ -46,7 +53,12 @@ def main(argv=None):
_scan_string(line, plugins)

else:
baseline_dict = _perform_scan(args, plugins)
baseline_dict = _perform_scan(
args,
plugins,
automaton,
word_list_hash,
)

if args.import_filename:
write_baseline_to_file(
Expand Down Expand Up @@ -87,7 +99,7 @@ def main(argv=None):
return 0


def _get_plugin_from_baseline(old_baseline):
def _get_plugins_from_baseline(old_baseline):
plugins = []
if old_baseline and 'plugins_used' in old_baseline:
secrets_collection = SecretsCollection.load_baseline_from_dict(old_baseline)
Expand All @@ -114,17 +126,25 @@ def _scan_string(line, plugins):
print('\n'.join(sorted(output)))


def _perform_scan(args, plugins):
def _perform_scan(args, plugins, automaton, word_list_hash):
"""
:param args: output of `argparse.ArgumentParser.parse_args`
:param plugins: tuple of initialized plugins
:type automaton: ahocorasick.Automaton|None
:param automaton: optional automaton for ignoring certain words.
:type word_list_hash: str|None
:param word_list_hash: optional iterated sha1 hash of the words in the word list.
:rtype: dict
"""
old_baseline = _get_existing_baseline(args.import_filename)
if old_baseline:
plugins = initialize.merge_plugin_from_baseline(
_get_plugin_from_baseline(old_baseline), args,
plugins = initialize.merge_plugins_from_baseline(
_get_plugins_from_baseline(old_baseline),
args,
automaton=automaton,
)

# Favors `--exclude-files` and `--exclude-lines` CLI arguments
Expand All @@ -139,6 +159,12 @@ def _perform_scan(args, plugins):
):
args.exclude_lines = old_baseline['exclude']['lines']

if (
not args.word_list_file
and old_baseline.get('word_list')
):
args.word_list_file = old_baseline['word_list']['file']

# If we have knowledge of an existing baseline file, we should use
# that knowledge and add it to our exclude_files regex.
if args.import_filename:
Expand All @@ -148,6 +174,8 @@ def _perform_scan(args, plugins):
plugins=plugins,
exclude_files_regex=args.exclude_files,
exclude_lines_regex=args.exclude_lines,
word_list_file=args.word_list_file,
word_list_hash=word_list_hash,
path=args.path,
should_scan_all_files=args.all_files,
).format_for_baseline_output()
Expand Down
33 changes: 31 additions & 2 deletions detect_secrets/plugins/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,11 @@ def adhoc_scan(self, string):
<classname>: <returned-value>
"""
# TODO: Handle multiple secrets on single line.
results = self.analyze_string(string, 0, 'does_not_matter')
results = self.analyze_string(
string,
line_num=0,
filename='does_not_matter',
)
if not results:
return 'False'

Expand Down Expand Up @@ -193,7 +197,7 @@ def __dict__(self):


class RegexBasedDetector(BasePlugin):
"""Base class for regular-expression based detectors.
"""Parent class for regular-expression based detectors.
To create a new regex-based detector, subclass this and set
`secret_type` with a description and `denylist`
Expand Down Expand Up @@ -235,3 +239,28 @@ def secret_generator(self, string, *args, **kwargs):
for regex in self.denylist:
for match in regex.findall(string):
yield match


class WordListSupportedDetector(BasePlugin):
    """Parent class for detectors that accept a word list automaton.

    A detector gains word list support by subclassing this class and
    forwarding `automaton` from its own __init__, for example:

        class BarDetector(WordListSupportedDetector):

            secret_type = "bar"

            def __init__(self, automaton=None, **kwargs):
                super(BarDetector, self).__init__(
                    automaton,
                    **kwargs
                )
                ...
    """
    __metaclass__ = ABCMeta

    def __init__(self, automaton=None, **kwargs):
        """
        :type automaton: ahocorasick.Automaton|None
        :param automaton: optional automaton for ignoring certain words.

        All remaining keyword arguments are passed on unchanged to the
        parent class initializer.
        """
        super(WordListSupportedDetector, self).__init__(**kwargs)

        # Stored on the instance under the same name it was passed in as.
        self.automaton = automaton
Loading

0 comments on commit f8cb31f

Please sign in to comment.