From f8cb31fd4d4cd8a5b1e1a5f2846961ded80553b5 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Wed, 18 Sep 2019 21:47:26 -0700 Subject: [PATCH] :tada: Add --word-list option - Add `pyahocorasick` as an optional dependency See issue #240 for more information. --- detect_secrets/core/baseline.py | 14 ++++- detect_secrets/core/secrets_collection.py | 29 +++++++++- detect_secrets/core/usage.py | 22 ++++++- detect_secrets/main.py | 38 +++++++++++-- detect_secrets/plugins/base.py | 33 ++++++++++- detect_secrets/plugins/common/filters.py | 57 ++++++++++++++++--- detect_secrets/plugins/common/initialize.py | 33 +++++++++-- .../plugins/high_entropy_strings.py | 42 ++++++-------- detect_secrets/plugins/keyword.py | 13 +++-- detect_secrets/pre_commit_hook.py | 12 +++- detect_secrets/util.py | 49 +++++++++++++++- requirements-dev.txt | 1 + setup.py | 3 + test_data/word_list.txt | 1 + testing/factories.py | 19 ++++++- tests/core/secrets_collection_test.py | 57 +++++++++++++++++-- tests/main_test.py | 8 +++ tests/plugins/common/filters_test.py | 10 ++-- tests/plugins/high_entropy_strings_test.py | 2 + tests/plugins/keyword_test.py | 24 ++++++++ tests/pre_commit_hook_test.py | 11 +++- tests/util_test.py | 23 ++++++++ 22 files changed, 425 insertions(+), 76 deletions(-) create mode 100644 test_data/word_list.txt diff --git a/detect_secrets/core/baseline.py b/detect_secrets/core/baseline.py index bff8da118..6e573ac92 100644 --- a/detect_secrets/core/baseline.py +++ b/detect_secrets/core/baseline.py @@ -18,17 +18,27 @@ def initialize( plugins, exclude_files_regex=None, exclude_lines_regex=None, + word_list_file=None, + word_list_hash=None, should_scan_all_files=False, ): """Scans the entire codebase for secrets, and returns a SecretsCollection object. + :type path: list + :type plugins: tuple of detect_secrets.plugins.base.BasePlugin :param plugins: rules to initialize the SecretsCollection with. :type exclude_files_regex: str|None :type exclude_lines_regex: str|None - :type path: list + + :type word_list_file: str|None + :param word_list_file: optional word list file for ignoring certain words. + + :type word_list_hash: str|None + :param word_list_hash: optional iterated sha1 hash of the words in the word list. + :type should_scan_all_files: bool :rtype: SecretsCollection @@ -37,6 +47,8 @@ def initialize( plugins, exclude_files=exclude_files_regex, exclude_lines=exclude_lines_regex, + word_list_file=word_list_file, + word_list_hash=word_list_hash, ) files_to_scan = [] diff --git a/detect_secrets/core/secrets_collection.py b/detect_secrets/core/secrets_collection.py index 9266dd15f..8ef68ae2b 100644 --- a/detect_secrets/core/secrets_collection.py +++ b/detect_secrets/core/secrets_collection.py @@ -12,6 +12,7 @@ from detect_secrets.core.log import log from detect_secrets.core.potential_secret import PotentialSecret from detect_secrets.plugins.common import initialize +from detect_secrets.util import build_automaton class SecretsCollection(object): @@ -21,6 +22,8 @@ def __init__( plugins=(), exclude_files=None, exclude_lines=None, + word_list_file=None, + word_list_hash=None, ): """ :type plugins: tuple of detect_secrets.plugins.base.BasePlugin @@ -32,14 +35,18 @@ def __init__( :type exclude_lines: str|None :param exclude_lines: optional regex for ignored lines. - :type version: str - :param version: version of detect-secrets that SecretsCollection - is valid at. + :type word_list_file: str|None + :param word_list_file: optional word list file for ignoring certain words. + + :type word_list_hash: str|None + :param word_list_hash: optional iterated sha1 hash of the words in the word list. """ self.data = {} self.plugins = plugins self.exclude_files = exclude_files self.exclude_lines = exclude_lines + self.word_list_file = word_list_file + self.word_list_hash = word_list_hash self.version = VERSION @classmethod @@ -93,6 +100,17 @@ def load_baseline_from_dict(cls, data): result.exclude_files = data['exclude']['files'] result.exclude_lines = data['exclude']['lines'] + # In v0.12.7 the `--word-list` option got added + automaton = None + if 'word_list' in data: + result.word_list_file = data['word_list']['file'] + result.word_list_hash = data['word_list']['hash'] + + if result.word_list_file: + # Always ignore the given `data['word_list']['hash']` + # The difference will show whenever the word list changes + automaton, result.word_list_hash = build_automaton(result.word_list_file) + plugins = [] for plugin in data['plugins_used']: plugin_classname = plugin.pop('name') @@ -100,6 +118,7 @@ def load_baseline_from_dict(cls, data): initialize.from_plugin_classname( plugin_classname, exclude_lines_regex=result.exclude_lines, + automaton=automaton, should_verify_secrets=False, **plugin ), @@ -277,6 +296,10 @@ def format_for_baseline_output(self): 'files': self.exclude_files, 'lines': self.exclude_lines, }, + 'word_list': { + 'file': self.word_list_file, + 'hash': self.word_list_hash, + }, 'plugins_used': plugins_used, 'results': results, 'version': self.version, diff --git a/detect_secrets/core/usage.py b/detect_secrets/core/usage.py index c66b2cf0d..3e933d7a4 100644 --- a/detect_secrets/core/usage.py +++ b/detect_secrets/core/usage.py @@ -14,6 +14,18 @@ def add_exclude_lines_argument(parser): ) +def add_word_list_argument(parser): + parser.add_argument( + '--word-list', + type=str, + help=( + 'Text file with a list of words, ' + 'if a secret contains a word in the list we ignore it.' + ), + dest='word_list_file', + ) + + def add_use_all_plugins_argument(parser): parser.add_argument( '--use-all-plugins', @@ -46,6 +58,7 @@ def add_pre_commit_arguments(self): self._add_filenames_argument()\ ._add_set_baseline_argument()\ ._add_exclude_lines_argument()\ + ._add_word_list_argument()\ ._add_use_all_plugins_argument()\ ._add_no_verify_flag() @@ -108,6 +121,10 @@ def _add_exclude_lines_argument(self): add_exclude_lines_argument(self.parser) return self + def _add_word_list_argument(self): + add_word_list_argument(self.parser) + return self + def _add_use_all_plugins_argument(self): add_use_all_plugins_argument(self.parser) return self @@ -143,9 +160,10 @@ def _add_initialize_baseline_argument(self): ), ) - # Pairing `--exclude-lines` to both pre-commit and `--scan` - # because it can be used for both. + # Pairing `--exclude-lines` and `--word-list` to + # both pre-commit and `--scan` because it can be used for both. add_exclude_lines_argument(self.parser) + add_word_list_argument(self.parser) # Pairing `--exclude-files` with `--scan` because it's only used for the initialization. # The pre-commit hook framework already has an `exclude` option that can diff --git a/detect_secrets/main.py b/detect_secrets/main.py index f6f0d5965..c939bfd28 100644 --- a/detect_secrets/main.py +++ b/detect_secrets/main.py @@ -13,6 +13,7 @@ from detect_secrets.core.secrets_collection import SecretsCollection from detect_secrets.core.usage import ParserBuilder from detect_secrets.plugins.common import initialize +from detect_secrets.util import build_automaton def parse_args(argv): @@ -30,11 +31,17 @@ def main(argv=None): log.set_debug_level(args.verbose) if args.action == 'scan': + automaton = None + word_list_hash = None + if args.word_list_file: + automaton, word_list_hash = build_automaton(args.word_list_file) + # Plugins are *always* rescanned with fresh settings, because # we want to get the latest updates. plugins = initialize.from_parser_builder( args.plugins, exclude_lines_regex=args.exclude_lines, + automaton=automaton, should_verify_secrets=not args.no_verify, ) if args.string: @@ -46,7 +53,12 @@ def main(argv=None): _scan_string(line, plugins) else: - baseline_dict = _perform_scan(args, plugins) + baseline_dict = _perform_scan( + args, + plugins, + automaton, + word_list_hash, + ) if args.import_filename: write_baseline_to_file( @@ -87,7 +99,7 @@ def main(argv=None): return 0 -def _get_plugin_from_baseline(old_baseline): +def _get_plugins_from_baseline(old_baseline): plugins = [] if old_baseline and 'plugins_used' in old_baseline: secrets_collection = SecretsCollection.load_baseline_from_dict(old_baseline) @@ -114,17 +126,25 @@ def _scan_string(line, plugins): print('\n'.join(sorted(output))) -def _perform_scan(args, plugins): +def _perform_scan(args, plugins, automaton, word_list_hash): """ :param args: output of `argparse.ArgumentParser.parse_args` :param plugins: tuple of initialized plugins + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring certain words. + + :type word_list_hash: str|None + :param word_list_hash: optional iterated sha1 hash of the words in the word list. + :rtype: dict """ old_baseline = _get_existing_baseline(args.import_filename) if old_baseline: - plugins = initialize.merge_plugin_from_baseline( - _get_plugin_from_baseline(old_baseline), args, + plugins = initialize.merge_plugins_from_baseline( + _get_plugins_from_baseline(old_baseline), + args, + automaton=automaton, ) # Favors `--exclude-files` and `--exclude-lines` CLI arguments @@ -139,6 +159,12 @@ def _perform_scan(args, plugins): ): args.exclude_lines = old_baseline['exclude']['lines'] + if ( + not args.word_list_file + and old_baseline.get('word_list') + ): + args.word_list_file = old_baseline['word_list']['file'] + # If we have knowledge of an existing baseline file, we should use # that knowledge and add it to our exclude_files regex. if args.import_filename: @@ -148,6 +174,8 @@ def _perform_scan(args, plugins): plugins=plugins, exclude_files_regex=args.exclude_files, exclude_lines_regex=args.exclude_lines, + word_list_file=args.word_list_file, + word_list_hash=word_list_hash, path=args.path, should_scan_all_files=args.all_files, ).format_for_baseline_output() diff --git a/detect_secrets/plugins/base.py b/detect_secrets/plugins/base.py index f3e0005ca..d4695fec1 100644 --- a/detect_secrets/plugins/base.py +++ b/detect_secrets/plugins/base.py @@ -148,7 +148,11 @@ def adhoc_scan(self, string): : """ # TODO: Handle multiple secrets on single line. - results = self.analyze_string(string, 0, 'does_not_matter') + results = self.analyze_string( + string, + line_num=0, + filename='does_not_matter', + ) if not results: return 'False' @@ -193,7 +197,7 @@ def __dict__(self): class RegexBasedDetector(BasePlugin): - """Base class for regular-expression based detectors. + """Parent class for regular-expression based detectors. To create a new regex-based detector, subclass this and set `secret_type` with a description and `denylist` @@ -235,3 +239,28 @@ def secret_generator(self, string, *args, **kwargs): for regex in self.denylist: for match in regex.findall(string): yield match + + +class WordListSupportedDetector(BasePlugin): + """Parent class for detectors supporting a word list. + + To create a new word list supported detector, subclass this + and pass `automaton` in __init__ like: + + class BarDetector(WordListSupportedDetector): + + secret_type = "bar" + + def __init__(self, automaton=None, **kwargs): + super(BarDetector, self).__init__( + automaton, + **kwargs + ) + ... + """ + __metaclass__ = ABCMeta + + def __init__(self, automaton=None, **kwargs): + super(WordListSupportedDetector, self).__init__(**kwargs) + + self.automaton = automaton diff --git a/detect_secrets/plugins/common/filters.py b/detect_secrets/plugins/common/filters.py index a6c02b2f5..e0fd0ebf1 100644 --- a/detect_secrets/plugins/common/filters.py +++ b/detect_secrets/plugins/common/filters.py @@ -1,27 +1,66 @@ """ -Heuristic, false positive filters that are shared across all plugin types. +False positive heuristic filters that are shared across all plugin types. This abstraction allows for development of later ML work, or further heuristical determinations (e.g. word filter, entropy comparator). """ import string +from detect_secrets.util import is_python_2 -def is_false_positive(secret): + +def is_false_positive(secret, automaton): + """ + :type secret: str + + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring certain words. + + :rtype: bool + Returns True if any false positive heuristic function returns True. + """ return any( - func(secret) + func(secret, automaton) for func in ( - is_sequential_string, + _is_found_with_aho_corasick, + _is_sequential_string, ) ) -def is_sequential_string(secret): +def _is_found_with_aho_corasick(secret, automaton): + """ + :type secret: str + + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring certain words. + + :rtype: bool + Returns True if secret contains a word in the automaton. """ - Returns true if string is sequential. + if not automaton: + return False + + if is_python_2(): # pragma: no cover + # Due to pyahocorasick + secret = secret.encode('utf-8') + + try: + next(automaton.iter(string=secret)) + return True + except StopIteration: + return False + + +def _is_sequential_string(secret, *args): + """ + :type secret: str + + :rtype: bool + Returns True if string is sequential. """ sequences = ( - # base64 letters first + # Base64 letters first ( string.ascii_uppercase + string.ascii_uppercase + @@ -29,7 +68,7 @@ def is_sequential_string(secret): '+/' ), - # base64 numbers first + # Base64 numbers first ( string.digits + string.ascii_uppercase + @@ -41,7 +80,7 @@ def is_sequential_string(secret): # sequences, since those will happen to be caught by the # base64 checks. - # alphanumeric sequences + # Alphanumeric sequences (string.digits + string.ascii_uppercase) * 2, # Capturing any number sequences diff --git a/detect_secrets/plugins/common/initialize.py b/detect_secrets/plugins/common/initialize.py index d051c8ee3..fb8696d01 100644 --- a/detect_secrets/plugins/common/initialize.py +++ b/detect_secrets/plugins/common/initialize.py @@ -19,6 +19,7 @@ def from_parser_builder( plugins_dict, exclude_lines_regex=None, + automaton=None, should_verify_secrets=False, ): """ @@ -28,16 +29,21 @@ def from_parser_builder( :type exclude_lines_regex: str|None :param exclude_lines_regex: optional regex for ignored lines. + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring certain words. + :type should_verify_secrets: bool :returns: tuple of initialized plugins """ output = [] + for plugin_name in plugins_dict: output.append( from_plugin_classname( plugin_name, exclude_lines_regex=exclude_lines_regex, + automaton=automaton, should_verify_secrets=should_verify_secrets, **plugins_dict[plugin_name] ), @@ -65,7 +71,7 @@ def _get_prioritized_parameters(plugins_dict, is_using_default_value_map, prefer yield plugin_name, param_name, param_value -def merge_plugin_from_baseline(baseline_plugins, args): +def merge_plugins_from_baseline(baseline_plugins, args, automaton): """ :type baseline_plugins: tuple of BasePlugin :param baseline_plugins: BasePlugin instances from baseline file @@ -73,7 +79,10 @@ def merge_plugin_from_baseline(baseline_plugins, args): :type args: dict :param args: dictionary of arguments parsed from usage - param priority: input param > baseline param > default + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring certain words. + + param priority is input param > baseline param > default :returns: tuple of initialized plugins """ @@ -89,10 +98,10 @@ def _remove_key(d, key): # Use input plugin as starting point if args.use_all_plugins: - # input param and default param are used + # Input param and default param are used plugins_dict = dict(args.plugins) - # baseline param priority > default + # Baseline param priority > default for plugin_name, param_name, param_value in _get_prioritized_parameters( baseline_plugins_dict, args.is_using_default_value, @@ -109,6 +118,7 @@ def _remove_key(d, key): return from_parser_builder( plugins_dict, exclude_lines_regex=args.exclude_lines, + automaton=automaton, should_verify_secrets=not args.no_verify, ) @@ -120,7 +130,7 @@ def _remove_key(d, key): if plugin_name not in disabled_plugins } - # input param priority > baseline + # Input param priority > baseline input_plugins_dict = dict(args.plugins) for plugin_name, param_name, param_value in _get_prioritized_parameters( input_plugins_dict, @@ -139,12 +149,14 @@ def _remove_key(d, key): return from_parser_builder( plugins_dict, exclude_lines_regex=args.exclude_lines, + automaton=automaton, ) def from_plugin_classname( plugin_classname, exclude_lines_regex=None, + automaton=None, should_verify_secrets=False, **kwargs ): @@ -155,6 +167,11 @@ def from_plugin_classname( :type exclude_lines_regex: str|None :param exclude_lines_regex: optional regex for ignored lines. + + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring English-words. + + :type should_verify_secrets: bool """ klass = globals()[plugin_classname] @@ -165,6 +182,7 @@ def from_plugin_classname( try: instance = klass( exclude_lines_regex=exclude_lines_regex, + automaton=automaton, should_verify=should_verify_secrets, **kwargs ) @@ -207,7 +225,10 @@ def from_secret_type(secret_type, settings): return from_plugin_classname( classname, - # `audit` doesn't need verification + # `audit` does not need to + # perform exclusion, filtering or verification + exclude_lines_regex=None, + automaton=None, should_verify_secrets=False, **plugin_init_vars diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py index 7262db607..82aa3c739 100644 --- a/detect_secrets/plugins/high_entropy_strings.py +++ b/detect_secrets/plugins/high_entropy_strings.py @@ -6,7 +6,6 @@ import configparser import base64 import math -import os import re import string from abc import ABCMeta @@ -15,27 +14,23 @@ import yaml -from .base import BasePlugin +from .base import WordListSupportedDetector +from .common.filetype import determine_file_type +from .common.filetype import FileType from .common.filters import is_false_positive from .common.ini_file_parser import IniFileParser from .common.yaml_file_parser import YamlFileParser from detect_secrets.core.potential_secret import PotentialSecret -YAML_EXTENSIONS = ( - '.yaml', - '.yml', -) - - -class HighEntropyStringsPlugin(BasePlugin): +class HighEntropyStringsPlugin(WordListSupportedDetector): """Base class for string pattern matching""" __metaclass__ = ABCMeta secret_type = 'High Entropy String' - def __init__(self, charset, limit, exclude_lines_regex, *args): + def __init__(self, charset, limit, exclude_lines_regex, automaton, *args): if limit < 0 or limit > 8: raise ValueError( 'The limit set for HighEntropyStrings must be between 0.0 and 8.0', @@ -47,6 +42,7 @@ def __init__(self, charset, limit, exclude_lines_regex, *args): super(HighEntropyStringsPlugin, self).__init__( exclude_lines_regex=exclude_lines_regex, + automaton=automaton, ) def analyze(self, file, filename): @@ -95,7 +91,7 @@ def analyze_string_content(self, string, line_num, filename): output = {} for result in self.secret_generator(string): - if is_false_positive(result): + if is_false_positive(result, self.automaton): continue secret = PotentialSecret(self.secret_type, filename, result, line_num) @@ -135,22 +131,16 @@ def adhoc_scan(self, string): return output @contextmanager - def non_quoted_string_regex(self, strict=True): + def non_quoted_string_regex(self): """For certain file formats, strings need not necessarily follow the normal convention of being denoted by single or double quotes. In these cases, we modify the regex accordingly. Public, because detect_secrets.core.audit needs to reference it. - - :type strict: bool - :param strict: if True, the regex will match the entire string. """ old_regex = self.regex - regex_alternative = r'([{}]+)'.format(re.escape(self.charset)) - if strict: - regex_alternative = r'^' + regex_alternative + r'$' - + regex_alternative = r'^([{}]+)$'.format(re.escape(self.charset)) self.regex = re.compile(regex_alternative) try: @@ -187,7 +177,7 @@ def _analyze_yaml_file(self, file, filename): """ :returns: same format as super().analyze() """ - if os.path.splitext(filename)[1] not in YAML_EXTENSIONS: + if determine_file_type(filename) != FileType.YAML: # The yaml parser is pretty powerful. It eagerly # parses things when it's not even a yaml file. Therefore, # we use this heuristic to quit early if appropriate. @@ -210,9 +200,11 @@ def _analyze_yaml_file(self, file, filename): if '__line__' in item and item['__line__'] not in ignored_lines: # An isinstance check doesn't work in py2 # so we need the __is_binary__ field. - string_to_scan = self.decode_binary(item['__value__']) \ - if item['__is_binary__'] \ + string_to_scan = ( + self.decode_binary(item['__value__']) + if item['__is_binary__'] else item['__value__'] + ) secrets = self.analyze_string( string_to_scan, @@ -276,11 +268,12 @@ class HexHighEntropyString(HighEntropyStringsPlugin): secret_type = 'Hex High Entropy String' - def __init__(self, hex_limit, exclude_lines_regex=None, **kwargs): + def __init__(self, hex_limit, exclude_lines_regex=None, automaton=None, **kwargs): super(HexHighEntropyString, self).__init__( charset=string.hexdigits, limit=hex_limit, exclude_lines_regex=exclude_lines_regex, + automaton=automaton, ) @property @@ -334,11 +327,12 @@ class Base64HighEntropyString(HighEntropyStringsPlugin): secret_type = 'Base64 High Entropy String' - def __init__(self, base64_limit, exclude_lines_regex=None, **kwargs): + def __init__(self, base64_limit, exclude_lines_regex=None, automaton=None, **kwargs): super(Base64HighEntropyString, self).__init__( charset=string.ascii_letters + string.digits + '+/=', limit=base64_limit, exclude_lines_regex=exclude_lines_regex, + automaton=automaton, ) @property diff --git a/detect_secrets/plugins/keyword.py b/detect_secrets/plugins/keyword.py index e9cca6bc7..d75269370 100644 --- a/detect_secrets/plugins/keyword.py +++ b/detect_secrets/plugins/keyword.py @@ -28,9 +28,10 @@ import re -from .base import BasePlugin +from .base import WordListSupportedDetector from .common.filetype import determine_file_type from .common.filetype import FileType +from .common.filters import is_false_positive from detect_secrets.core.potential_secret import PotentialSecret @@ -246,16 +247,18 @@ } -class KeywordDetector(BasePlugin): +class KeywordDetector(WordListSupportedDetector): """This checks if denylisted keywords are present in the analyzed string. """ secret_type = 'Secret Keyword' - def __init__(self, keyword_exclude=None, exclude_lines_regex=None, **kwargs): + def __init__(self, keyword_exclude=None, exclude_lines_regex=None, automaton=None, **kwargs): super(KeywordDetector, self).__init__( - exclude_lines_regex, + exclude_lines_regex=exclude_lines_regex, + automaton=automaton, + **kwargs ) self.keyword_exclude = None @@ -276,6 +279,8 @@ def analyze_string_content(self, string, line_num, filename): string, filetype=determine_file_type(filename), ): + if is_false_positive(identifier, self.automaton): + continue secret = PotentialSecret( self.secret_type, filename, diff --git a/detect_secrets/pre_commit_hook.py b/detect_secrets/pre_commit_hook.py index 871be44a2..0543f02cc 100644 --- a/detect_secrets/pre_commit_hook.py +++ b/detect_secrets/pre_commit_hook.py @@ -13,6 +13,7 @@ from detect_secrets.core.secrets_collection import SecretsCollection from detect_secrets.core.usage import ParserBuilder from detect_secrets.plugins.common import initialize +from detect_secrets.util import build_automaton log = get_logger(format_string='%(message)s') @@ -37,17 +38,24 @@ def main(argv=None): # Error logs handled within logic. return 1 + automaton = None + word_list_hash = None + if args.word_list_file: + automaton, word_list_hash = build_automaton(args.word_list_file) + plugins = initialize.from_parser_builder( args.plugins, exclude_lines_regex=args.exclude_lines, + automaton=automaton, should_verify_secrets=not args.no_verify, ) - # Merge plugin from baseline + # Merge plugins from baseline if baseline_collection: - plugins = initialize.merge_plugin_from_baseline( + plugins = initialize.merge_plugins_from_baseline( baseline_collection.plugins, args, + automaton, ) baseline_collection.plugins = plugins diff --git a/detect_secrets/util.py b/detect_secrets/util.py index b11c30d54..bb8f688c7 100644 --- a/detect_secrets/util.py +++ b/detect_secrets/util.py @@ -1,8 +1,51 @@ +import hashlib import os import subprocess +import sys -def get_root_directory(): # pragma: no cover +def is_python_2(): + return sys.version_info[0] < 3 + + +def build_automaton(word_list): + """ + :type word_list: str + :param word_list: optional word list file for ignoring certain words. + + :rtype: (ahocorasick.Automaton, str) + :returns: an automaton, and an iterated sha1 hash of the words in the word list. + """ + # Dynamic import due to optional-dependency + try: + import ahocorasick + except ImportError: # pragma: no cover + print('Please install the `pyahocorasick` package to use --word-list') + raise + + # See https://pyahocorasick.readthedocs.io/en/latest/ + # for more information. + automaton = ahocorasick.Automaton() + word_list_hash = '' + + with open(word_list) as f: + for line in f: + line = line.strip() + if line: + word_list_hash = hashlib.sha1( + (word_list_hash + line).encode('utf-8'), + ).hexdigest() + automaton.add_word(line, line) + + automaton.make_automaton() + + return ( + automaton, + word_list_hash, + ) + + +def get_root_directory(): # pragma: no cover return os.path.realpath( os.path.join( os.path.dirname(__file__), @@ -19,7 +62,7 @@ def get_relative_path(root, path): def get_git_sha(path): - """Returns the sha of the git checkout at the input path + """Returns the sha of the git checkout at the input path. :type path: str :param path: directory of the git checkout @@ -40,7 +83,7 @@ def get_git_sha(path): def get_git_remotes(path): """Returns a list of unique git remotes of the checkout - at the input path + at the input path. :type path: str :param path: directory of the git checkout diff --git a/requirements-dev.txt b/requirements-dev.txt index 2f6cc5934..c666ed3ff 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,6 +3,7 @@ flake8==3.5.0 mock monotonic pre-commit==1.11.2 +pyahocorasick pytest pyyaml tox>=3.8 diff --git a/setup.py b/setup.py index 76c1cda86..1fd9e5541 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,9 @@ 'future', 'functools32', ], + 'word_list': [ + 'pyahocorasick', + ], }, entry_points={ 'console_scripts': [ diff --git a/test_data/word_list.txt b/test_data/word_list.txt new file mode 100644 index 000000000..7c3fc6a28 --- /dev/null +++ b/test_data/word_list.txt @@ -0,0 +1 @@ +c3VwZXIg diff --git a/testing/factories.py b/testing/factories.py index 6c3ea0467..db58cee6a 100644 --- a/testing/factories.py +++ b/testing/factories.py @@ -4,18 +4,29 @@ from detect_secrets.core.secrets_collection import SecretsCollection -def potential_secret_factory(type_='type', filename='filename', secret='secret', lineno=1): +def potential_secret_factory( + type_='type', + filename='filename', + secret='secret', + lineno=1, +): """This is only marginally better than creating PotentialSecret objects directly, because of the default values. """ return PotentialSecret(type_, filename, secret, lineno) -def secrets_collection_factory(secrets=None, plugins=(), exclude_files_regex=None): +def secrets_collection_factory( + secrets=None, + plugins=(), + exclude_files_regex=None, + word_list_file=None, + word_list_hash=None, +): """ :type secrets: list(dict) :param secrets: list of params to pass to add_secret. - E.g. [ {'secret': 'blah'}, ] + e.g. [ {'secret': 'blah'}, ] :type plugins: tuple :type exclude_files_regex: str|None @@ -25,6 +36,8 @@ def secrets_collection_factory(secrets=None, plugins=(), exclude_files_regex=Non collection = SecretsCollection( plugins, exclude_files=exclude_files_regex, + word_list_file=word_list_file, + word_list_hash=word_list_hash, ) if plugins: diff --git a/tests/core/secrets_collection_test.py b/tests/core/secrets_collection_test.py index b43e6e3b3..5c8860ae9 100644 --- a/tests/core/secrets_collection_test.py +++ b/tests/core/secrets_collection_test.py @@ -1,5 +1,6 @@ from __future__ import absolute_import +import hashlib import json from contextlib import contextmanager from time import gmtime @@ -287,13 +288,14 @@ def setup(self): PrivateKeyDetector(), ), exclude_files_regex='foo', + word_list_file='will_be_mocked.txt', + word_list_hash='5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8', ) def test_output(self, mock_gmtime): assert ( self.logic.format_for_baseline_output() - - == self.get_point_twelve_and_later_baseline_dict(mock_gmtime) + == self.get_point_twelve_point_seven_and_later_baseline_dict(mock_gmtime) ) def test_load_baseline_from_string_with_pre_point_twelve_string(self, mock_gmtime): @@ -312,12 +314,12 @@ def test_load_baseline_from_string_with_pre_point_twelve_string(self, mock_gmtim assert secrets['exclude']['lines'] is None assert old_original['results'] == secrets['results'] - def test_load_baseline_from_string_with_point_twelve_and_later_string(self, mock_gmtime): + def test_load_baseline_from_string_with_point_twelve_to_twelve_six_string(self, mock_gmtime): """ We use load_baseline_from_string as a proxy to testing load_baseline_from_dict, because it's the most entry into the private function. """ - original = self.get_point_twelve_and_later_baseline_dict(mock_gmtime) + original = self.get_point_twelve_to_twelve_six_later_baseline_dict(mock_gmtime) secrets = SecretsCollection.load_baseline_from_string( json.dumps(original), @@ -327,6 +329,43 @@ def test_load_baseline_from_string_with_point_twelve_and_later_string(self, mock assert secrets['exclude']['lines'] is None assert original['results'] == secrets['results'] + def test_load_baseline_from_string_with_point_twelve_point_seven_and_later_string( + self, + mock_gmtime, + ): + """ + We use load_baseline_from_string as a proxy to testing load_baseline_from_dict, + because it's the most entry into the private function. + """ + original = self.get_point_twelve_point_seven_and_later_baseline_dict(mock_gmtime) + + word_list = """ + roller\n + """ + with mock_open_base( + data=word_list, + namespace='detect_secrets.util.open', + ): + secrets = SecretsCollection.load_baseline_from_string( + json.dumps(original), + ).format_for_baseline_output() + + # v0.12.7+ assertions + assert original['word_list']['file'] == secrets['word_list']['file'] + # Original hash is thrown out and replaced with new word list hash + assert ( + secrets['word_list']['hash'] + == + hashlib.sha1('roller'.encode('utf-8')).hexdigest() + != + original['word_list']['hash'] + ) + + # Regular assertions + assert original['exclude']['files'] == secrets['exclude']['files'] + assert secrets['exclude']['lines'] is None + assert original['results'] == secrets['results'] + def test_load_baseline_without_any_valid_fields(self, mock_log): with pytest.raises(IOError): SecretsCollection.load_baseline_from_string( @@ -346,7 +385,15 @@ def test_load_baseline_without_exclude(self, mock_log): ) assert mock_log.error_messages == 'Incorrectly formatted baseline!\n' - def get_point_twelve_and_later_baseline_dict(self, gmtime): + def get_point_twelve_point_seven_and_later_baseline_dict(self, gmtime): + # In v0.12.7 --word-list got added + baseline = self.get_point_twelve_to_twelve_six_later_baseline_dict(gmtime) + baseline['word_list'] = {} + baseline['word_list']['file'] = 'will_be_mocked.txt' + baseline['word_list']['hash'] = '5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8' + return baseline + + def get_point_twelve_to_twelve_six_later_baseline_dict(self, gmtime): # In v0.12.0 `exclude_regex` got replaced by `exclude` baseline = self._get_baseline_dict(gmtime) baseline['exclude'] = {} diff --git a/tests/main_test.py b/tests/main_test.py index ec69d6252..fc0239040 100644 --- a/tests/main_test.py +++ b/tests/main_test.py @@ -31,6 +31,8 @@ def test_scan_basic(self, mock_baseline_initialize): exclude_lines_regex=None, path='.', should_scan_all_files=False, + word_list_file=None, + word_list_hash=None, ) def test_scan_with_rootdir(self, mock_baseline_initialize): @@ -43,6 +45,8 @@ def test_scan_with_rootdir(self, mock_baseline_initialize): exclude_lines_regex=None, path=['test_data'], should_scan_all_files=False, + word_list_file=None, + word_list_hash=None, ) def test_scan_with_exclude_args(self, mock_baseline_initialize): @@ -57,6 +61,8 @@ def test_scan_with_exclude_args(self, mock_baseline_initialize): exclude_lines_regex='other_patt', path='.', should_scan_all_files=False, + word_list_file=None, + word_list_hash=None, ) @pytest.mark.parametrize( @@ -139,6 +145,8 @@ def test_scan_with_all_files_flag(self, mock_baseline_initialize): exclude_lines_regex=None, path='.', should_scan_all_files=True, + word_list_file=None, + word_list_hash=None, ) def test_reads_from_stdin(self, mock_merge_baseline): diff --git a/tests/plugins/common/filters_test.py b/tests/plugins/common/filters_test.py index 007d4d089..77e8c93a6 100644 --- a/tests/plugins/common/filters_test.py +++ b/tests/plugins/common/filters_test.py @@ -6,12 +6,10 @@ class TestIsSequentialString: - # TODO: More tests should be had. - @pytest.mark.parametrize( 'secret', ( - # ascii sequence + # ASCII sequence 'ABCDEF', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', @@ -27,13 +25,13 @@ class TestIsSequentialString: '0123456789abcdef', 'abcdef0123456789', - # base64 sequences + # Base64 sequences 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/', '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/', ), ) def test_success(self, secret): - assert filters.is_sequential_string(secret) + assert filters._is_sequential_string(secret) @pytest.mark.parametrize( 'secret', @@ -42,4 +40,4 @@ def test_success(self, secret): ), ) def test_failure(self, secret): - assert not filters.is_sequential_string(secret) + assert not filters._is_sequential_string(secret) diff --git a/tests/plugins/high_entropy_strings_test.py b/tests/plugins/high_entropy_strings_test.py index 97dffc4ea..c706718a2 100644 --- a/tests/plugins/high_entropy_strings_test.py +++ b/tests/plugins/high_entropy_strings_test.py @@ -293,3 +293,5 @@ def test_discounts_when_all_numbers(self): # This makes sure it only occurs with numbers. assert self.logic.calculate_shannon_entropy('12345a') == \ original_scanner.calculate_shannon_entropy('12345a') + assert self.logic.calculate_shannon_entropy('0') == \ + original_scanner.calculate_shannon_entropy('0') diff --git a/tests/plugins/keyword_test.py b/tests/plugins/keyword_test.py index 3b7eb4bf0..9b8866ba2 100644 --- a/tests/plugins/keyword_test.py +++ b/tests/plugins/keyword_test.py @@ -1,10 +1,12 @@ from __future__ import absolute_import from __future__ import unicode_literals +import ahocorasick import pytest from detect_secrets.core.potential_secret import PotentialSecret from detect_secrets.plugins.keyword import KeywordDetector +from detect_secrets.util import is_python_2 from testing.mocks import mock_file_object @@ -193,6 +195,28 @@ def test_analyze_standard_positives(self, file_content): == PotentialSecret.hash_secret('m{{h}o)p${e]nob(ody[finds>-_$#thisone}}') ) + @pytest.mark.parametrize( + 'file_content', + STANDARD_POSITIVES, + ) + def test_analyze_standard_positives_with_automaton(self, file_content): + automaton = ahocorasick.Automaton() + + word = 'thisone' + if is_python_2(): # pragma: no cover + # Due to pyahocorasick + word = word.encode('utf-8') + automaton.add_word(word, word) + + automaton.make_automaton() + + logic = KeywordDetector(automaton=automaton) + + f = mock_file_object(file_content) + output = logic.analyze(f, 'mock_filename') + # All skipped due to automaton + assert len(output) == 0 + @pytest.mark.parametrize( 'file_content', STANDARD_POSITIVES, diff --git a/tests/pre_commit_hook_test.py b/tests/pre_commit_hook_test.py index fb04c82e8..0aef962be 100644 --- a/tests/pre_commit_hook_test.py +++ b/tests/pre_commit_hook_test.py @@ -47,13 +47,18 @@ def test_file_with_secrets(self, mock_log): assert message_by_lines[3] == \ 'Location: test_data/files/file_with_secrets.py:3' + def test_file_with_secrets_with_word_list(self): + assert_commit_succeeds( + 'test_data/files/file_with_secrets.py --word-list test_data/word_list.txt', + ) + def test_file_no_secrets(self): assert_commit_succeeds('test_data/files/file_with_no_secrets.py') @pytest.mark.parametrize( 'has_result, use_private_key_scan, hook_command, commit_succeeds', [ - # basic case + # Basic case (True, True, '--baseline will_be_mocked test_data/files/file_with_secrets.py', True), # test_no_overwrite_pass_when_baseline_did_not_use_scanner (True, False, '--baseline will_be_mocked test_data/files/private_key', True), @@ -171,6 +176,10 @@ def test_that_baseline_gets_updated( assert original_baseline['exclude_regex'] == baseline_written['exclude']['files'] assert original_baseline['results'] == baseline_written['results'] + assert 'word_list' not in original_baseline + assert baseline_written['word_list']['file'] is None + assert baseline_written['word_list']['hash'] is None + # See that we updated the plugins and version assert current_version == baseline_written['version'] assert baseline_written['plugins_used'] == [ diff --git a/tests/util_test.py b/tests/util_test.py index bd03cdd68..db9c45ef8 100644 --- a/tests/util_test.py +++ b/tests/util_test.py @@ -1,9 +1,12 @@ +import hashlib import subprocess import mock import pytest from detect_secrets import util +from detect_secrets.plugins.common import filters +from testing.mocks import mock_open GIT_REPO_SHA = b'cbb33d8c545ccf5c55fdcc7d5b0218078598e677' GIT_REMOTES_VERBOSE_ONE_URL = ( @@ -18,6 +21,26 @@ ) +def test_build_automaton(): + word_list = """ + foam\n + """ + with mock_open( + data=word_list, + namespace='detect_secrets.util.open', + ): + automaton, word_list_hash = util.build_automaton(word_list='will_be_mocked.txt') + assert word_list_hash == hashlib.sha1('foam'.encode('utf-8')).hexdigest() + assert filters._is_found_with_aho_corasick( + secret='foam_roller', + automaton=automaton, + ) + assert not filters._is_found_with_aho_corasick( + secret='no_words_in_word_list', + automaton=automaton, + ) + + def test_get_git_sha(): with mock.patch.object( subprocess,