cortex: Turn analyzer PoC into job tracker
The current implementation of Cortex access has PoC characteristics in that it
does not use a job tracker and blocks until a single job has finished.

Turn the Cortex module into a job tracker analogous to Cuckoo. Add a
class-based abstraction of the backend analyzers and their reports. Make the
whole thing configurable through analyzers.conf.

Unfortunately, the cortex4py module does not use requests sessions. So there is
no easy way to keep the retry and backoff logic of the Cuckoo module.

While at it, move analyzer-specific exceptions into their modules for clarity.

TODO:
- proper error and retry handling
- test cases
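
For illustration, the class-based abstraction mentioned above might look roughly like the following sketch. Only CortexAnalyzer (referenced in the rules.py docstrings below) and the report class names from the documentation diff appear in this commit; everything else here is hypothetical, since peekaboo/toolbox/cortex.py itself is not shown on this page.

    # Hypothetical sketch of the analyzer/report abstraction; not the actual
    # peekaboo/toolbox/cortex.py implementation.

    class CortexAnalyzerReport:
        """ Wraps the raw JSON report of one Cortex analyzer run. """
        def __init__(self, report=None):
            self.report = report if report is not None else {}

        @property
        def full(self):
            """ The full, unprocessed analyzer report. """
            return self.report


    class FileInfoReport(CortexAnalyzerReport):
        """ Report of the Cortex FileInfo analyzer. """


    class CortexAnalyzer:
        """ Describes one Cortex backend analyzer and how to submit to it. """
        name = None  # analyzer name as registered in Cortex (hypothetical)
        report_class = CortexAnalyzerReport

        def get_submit_parameters(self, sample, submit_original_filename=True):
            """ Build the submission data for this sample. """
            raise NotImplementedError


    class FileInfoAnalyzer(CortexAnalyzer):
        name = 'FileInfo'
        report_class = FileInfoReport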
michaelweiser committed Jun 24, 2021
1 parent b4471fb commit 15a0321
Showing 11 changed files with 661 additions and 212 deletions.
25 changes: 25 additions & 0 deletions analyzers.conf.sample
@@ -24,3 +24,28 @@
# New installations create a bearer token by default and require it but upgraded
# installations don't automatically get one.
#api_token : <empty>

# Cortex analyzer settings
[cortex]
# where to reach the Cortex REST API
#url: http://127.0.0.1:9001

# Token to authenticate to the Cortex REST API with.
#api_token : <empty>

# how long to wait between checks of job status
#poll_interval: 5

# Submit samples with their original filenames if available. Enhances
# authenticity of analysis environment but also leaks original filenames into
# Cortex's database.
#submit_original_filename : yes

# Specify how long to track running Cortex jobs before giving up on them. This
# does not actively cancel jobs; it is meant to handle cases where jobs have
# for some reason been dropped by or got stuck within Cortex. The value is
# unrelated to how long our client is willing to wait for a result: even if
# the client gives up on us, we normally still want to learn and cache the job
# result, since the analysis was expensive and the sample might be presented
# to us again.
#maximum_job_age : 900
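
For illustration, a fully populated [cortex] section could look like this (host name and token are made up):

    [cortex]
    url: http://cortex.example.com:9001
    api_token: 0123456789abcdefghijklmnopqrstuv
    poll_interval: 5
    submit_original_filename: yes
    maximum_job_age: 900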
2 changes: 1 addition & 1 deletion docs/source/ruleset.rst
@@ -141,7 +141,7 @@ Attribues of cortexreport
.. code-block:: shell
File_InfoReport.full
FileInfoReport.full
HybridAnalysisReport.full
VirusTotalQueryReport.n_of_all
VirusTotalQueryReport.level
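
These attributes are meant to be referenced from expression rules in the ruleset configuration. A hypothetical example follows; section name, rule numbering and the threshold are illustrative assumptions, not taken from the shipped ruleset:

    [expressions]
    expression.0 : cortexreport.VirusTotalQueryReport.n_of_all > 0 -> bad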
13 changes: 13 additions & 0 deletions peekaboo/config.py
@@ -456,13 +456,26 @@ def __init__(self, config_file=None):
self.cuckoo_submit_original_filename = True
self.cuckoo_maximum_job_age = 15*60

self.cortex_url = 'http://127.0.0.1:9001'
self.cortex_api_token = ''
self.cortex_poll_interval = 5
self.cortex_submit_original_filename = True
self.cortex_maximum_job_age = 15*60

config_options = {
'cuckoo_url': ['cuckoo', 'url'],
'cuckoo_api_token': ['cuckoo', 'api_token'],
'cuckoo_poll_interval': ['cuckoo', 'poll_interval'],
'cuckoo_submit_original_filename': [
'cuckoo', 'submit_original_filename'],
'cuckoo_maximum_job_age': ['cuckoo', 'maximum_job_age'],

'cortex_url': ['cortex', 'url'],
'cortex_api_token': ['cortex', 'api_token'],
'cortex_poll_interval': ['cortex', 'poll_interval'],
'cortex_submit_original_filename': [
'cortex', 'submit_original_filename'],
'cortex_maximum_job_age': ['cortex', 'maximum_job_age'],
}

# read configuration file. Note that we require a configuration file
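
The config_options mapping above ties each attribute to its [section, option] location in analyzers.conf. A minimal sketch of how such a mapping could be applied on top of the defaults, assuming Python's configparser; the actual parsing code lies outside the hunk shown here:

    from configparser import ConfigParser

    def apply_config_options(obj, config_file, config_options):
        """ Override default attribute values with settings from the file. """
        parser = ConfigParser()
        parser.read(config_file)

        for attribute, (section, option) in config_options.items():
            if not parser.has_option(section, option):
                continue  # keep the hard-coded default

            # read the option with the same type as the default value
            default = getattr(obj, attribute)
            if isinstance(default, bool):
                value = parser.getboolean(section, option)
            elif isinstance(default, int):
                value = parser.getint(section, option)
            else:
                value = parser.get(section, option)

            setattr(obj, attribute, value)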
5 changes: 0 additions & 5 deletions peekaboo/exceptions.py
@@ -63,8 +63,3 @@ class PeekabooAnalysisDeferred(PeekabooRulesetException):
take into account that the ruleset will be rerun from the very beginning.
"""
pass


class CuckooSubmitFailedException(PeekabooException):
""" An exception raised if submitting a job to Cuckoo fails. """
pass
27 changes: 27 additions & 0 deletions peekaboo/ruleset/engine.py
@@ -28,6 +28,7 @@
from peekaboo.ruleset import Result, RuleResult
from peekaboo.ruleset.rules import *
from peekaboo.toolbox.cuckoo import Cuckoo
from peekaboo.toolbox.cortex import Cortex
from peekaboo.toolbox.peekabooyar import ContainsPeekabooYarRule
from peekaboo.exceptions import PeekabooAnalysisDeferred, \
PeekabooConfigException, PeekabooRulesetConfigError
@@ -79,6 +80,7 @@ def __init__(self, config, job_queue, db_con, analyzer_config):
self.db_con = db_con
self.analyzer_config = analyzer_config
self.cuckoo = None
self.cortex = None
self.rules = []

self.shutdown_requested = False
@@ -147,6 +149,25 @@ def start(self):

rule.set_cuckoo_job_tracker(self.cuckoo)

if rule.uses_cortex:
if self.cortex is None:
logger.debug(
"Rule %s uses Cortex. Starting job tracker.", rule_name)

self.cortex = Cortex(
self.job_queue,
self.analyzer_config.cortex_url,
self.analyzer_config.cortex_api_token,
self.analyzer_config.cortex_poll_interval,
self.analyzer_config.cortex_submit_original_filename,
self.analyzer_config.cortex_maximum_job_age)

if not self.cortex.start_tracker():
raise PeekabooRulesetConfigError(
"Failure to initialize Cortex job tracker")

rule.set_cortex_job_tracker(self.cortex)

self.rules.append(rule)

# abort startup if we've been asked to shut down meanwhile
@@ -197,6 +218,9 @@ def shut_down_resources(self):
if self.cuckoo is not None:
self.cuckoo.shut_down()

if self.cortex is not None:
self.cortex.shut_down()

def shut_down(self):
""" Initiate asynchronous shutdown of the ruleset engine and dependent
logic such as job trackers. """
@@ -207,3 +231,6 @@ def close_down(self):
""" Finalize ruleset engine shutdown synchronously. """
if self.cuckoo is not None:
self.cuckoo.close_down()

if self.cortex is not None:
self.cortex.close_down()
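
The engine only depends on the tracker's constructor and its start_tracker(), shut_down() and close_down() methods. A minimal sketch of such a tracker, assuming a background thread that polls Cortex; the real peekaboo/toolbox/cortex.py is not part of the excerpt on this page, so the internals here are guessed:

    import logging
    import threading
    import time

    logger = logging.getLogger(__name__)

    class PollingJobTracker:
        """ Hypothetical polling job tracker, not the actual Cortex class. """
        def __init__(self, job_queue, url, api_token, poll_interval,
                     submit_original_filename, max_job_age):
            self.job_queue = job_queue
            self.url = url
            self.api_token = api_token
            self.poll_interval = poll_interval
            self.submit_original_filename = submit_original_filename
            self.max_job_age = max_job_age
            self.jobs = {}  # job id -> (sample, submit timestamp)
            self.running = False
            self.thread = None

        def start_tracker(self):
            """ Start the polling thread; return False on failure. """
            self.running = True
            self.thread = threading.Thread(target=self.track, name="JobTracker")
            self.thread.start()
            return True

        def track(self):
            while self.running:
                now = time.time()
                for job_id, (sample, submitted) in list(self.jobs.items()):
                    if now - submitted > self.max_job_age:
                        # do not cancel the job, just stop tracking it
                        logger.warning("Giving up on job %s", job_id)
                        del self.jobs[job_id]
                    # otherwise: ask the Cortex REST API for the job status
                    # and hand the sample back to the job queue once done
                time.sleep(self.poll_interval)

        def shut_down(self):
            """ Ask the tracker thread to terminate. """
            self.running = False

        def close_down(self):
            """ Wait for the tracker thread to finish. """
            if self.thread is not None:
                self.thread.join()

In line with the commit message, a loop like this would also be the place where proper retry and backoff handling around the Cortex API calls would eventually live.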
162 changes: 127 additions & 35 deletions peekaboo/ruleset/rules.py
@@ -32,14 +32,14 @@
from peekaboo.ruleset.expressions import ExpressionParser, \
IdentifierMissingException
from peekaboo.exceptions import PeekabooAnalysisDeferred, \
CuckooSubmitFailedException, PeekabooRulesetConfigError
PeekabooRulesetConfigError
from peekaboo.sample import Sample
from peekaboo.toolbox.cuckoo import CuckooReport
from peekaboo.toolbox.cuckoo import CuckooReport, CuckooSubmitFailedException
from peekaboo.toolbox.ole import Oletools, OletoolsReport
from peekaboo.toolbox.file import Filetools, FiletoolsReport
from peekaboo.toolbox.known import Knowntools, KnowntoolsReport
from peekaboo.toolbox.cortex import Cortextools, CortexReport

from peekaboo.toolbox.cortex import CortexReport, \
CortexSubmitFailedException, CortexAnalyzerReportMissingException

logger = logging.getLogger(__name__)

@@ -50,6 +50,7 @@ class Rule:
connection) or helper functions. """
rule_name = 'unimplemented'
uses_cuckoo = False
uses_cortex = False

def __init__(self, config, db_con):
""" Initialize common configuration and resources.
@@ -63,6 +64,7 @@ def __init__(self, config, db_con):
self.db_con = db_con

self.cuckoo = None
self.cortex = None

# initialise and validate configuration
self.config_options = {}
@@ -131,6 +133,15 @@ def set_cuckoo_job_tracker(self, cuckoo):
"""
self.cuckoo = cuckoo

def set_cortex_job_tracker(self, cortex):
""" Set the Cortex job tracker to use for submitting samples to Cortex
as well as tracking status.
@param cortex: the Cortex job tracker to use
@type cortex: Cortex
"""
self.cortex = cortex

def get_cuckoo_report(self, sample):
""" Get the samples cuckoo_report or submit the sample for analysis by
Cuckoo.
@@ -176,12 +187,50 @@ def get_knowntools_report(self, sample):
"""
return Knowntools(sample, self.db_con).get_report()

def get_cortextools_report(self, sample):
""" Get a Cortextools report on the sample.
def get_cortex_report(self, sample):
""" Get the sample's Cortex report.
@returns: CortexReport or None if a previous analysis attempt has
already failed.
"""
if sample.cortex_failed:
return None

@returns: CortextoolsReport
report = sample.cortex_report
if report is None:
# here we synthesize the main CortexReport as a (mostly) empty
# proxy and attach it to the sample. Since the report consists of
# potentially multiple subreports of Cortex analyzers, the report
# may request submission to an actual analyzer through an
# exception when accessing certain properties.
report = CortexReport()
sample.register_cortex_report(report)

return report

def submit_to_cortex(self, sample, analyzer):
""" Submit the sample to an actual Cortex analyzer to augment the
report.
@param sample: The sample to submit to Cortex.
@type sample: Sample
@param analyzer: The Cortex analyzer to submit to.
@type analyzer: subclass of CortexAnalyzer
@returns: None if submit failed
@raises PeekabooAnalysisDeferred: if successfully submitted to abort
ruleset run until result has been
retrieved.
"""
return Cortextools(sample).get_report()
logger.debug("Submitting %s to Cortex", sample.submit_path)
try:
job_id = self.cortex.submit(sample, analyzer)
except CortexSubmitFailedException as failed:
logger.error("Submit to Cortex failed: %s", failed)
return None

logger.info('Sample submitted to Cortex. Job ID: %s. '
'Sample: %s', job_id, sample)
raise PeekabooAnalysisDeferred()


class KnownRule(Rule):
@@ -591,6 +640,10 @@ def get_config(self):
# attempting anything illegal
try:
parsed_expression.eval(context=context)
except CortexAnalyzerReportMissingException:
# This exception tells us that CortexReport knows the analyzer
# and wants a job submitted. So all is well.
pass
except IdentifierMissingException as missing:
# our dummy context provides everything we would provide at
# runtime as well, so any missing identifier is an error at
@@ -623,6 +676,55 @@ def uses_cuckoo(self):
class variable with a dynamic determination. """
return self.uses_identifier("cuckooreport")

@property
def uses_cortex(self):
""" Tells if any expression uses the Cortex report. Overrides base
class variable with a dynamic determination. """
return self.uses_identifier("cortexreport")

def resolve_identifier(self, identifier, context, sample):
""" Resolves a missing identifer into an object.
@param identifer: Name of identifer to resolve.
@type identifier: string
@returns: object or None if identifier is unknown.
"""
if identifier == "cuckooreport":
logger.debug("Expression requests cuckoo report")
value = self.get_cuckoo_report(sample)
if value is None:
return self.result(
Result.failed,
_("Evaluation of expression couldn't get cuckoo "
"report."),
False)
elif identifier == "olereport":
logger.debug("Expression requests oletools report")
value = self.get_oletools_report(sample)
elif identifier == "filereport":
logger.debug("Expression requests filetools report")
value = self.get_filetools_report(sample)
elif identifier == "knownreport":
logger.debug("Expression requests knowntools report")
value = self.get_knowntools_report(sample)
elif identifier == "cortexreport":
logger.debug("Expression requests cortex report")
value = self.get_cortex_report(sample)
if value is None:
return self.result(
Result.failed,
_("Evaluation of expression couldn't get Cortex "
"report."),
False)
# elif here for other identifiers
else:
return self.result(
Result.failed,
_("Evaluation of expression uses undefined identifier."), False)

context['variables'][identifier] = value
return None

def evaluate(self, sample):
""" Match what rules report against our known result status names. """
for ruleno, expression in enumerate(self.expressions):
@@ -632,42 +734,32 @@ def evaluate(self, sample):
# retry until expression evaluation doesn't throw exceptions any
# more
while True:
identifier = None
cortex_analyzer = None
try:
result = expression.eval(context=context)
break
except IdentifierMissingException as missing:
identifier = missing.name

if identifier == "cuckooreport":
logger.debug("Expression requests cuckoo report")
value = self.get_cuckoo_report(sample)
if value is None:
return self.result(
Result.failed,
_("Evaluation of expression couldn't get cuckoo "
"report."),
False)
elif identifier == "olereport":
logger.debug("Expression requests oletools report")
value = self.get_oletools_report(sample)
elif identifier == "filereport":
logger.debug("Expression requests filetools report")
value = self.get_filetools_report(sample)
elif identifier == "knownreport":
logger.debug("Expression requests knowntools report")
value = self.get_knowntools_report(sample)
elif identifier == "cortexreport":
logger.debug("Expression requests cortextools report")
value = self.get_cortextools_report(sample)
# elif here for other identifiers
else:
except CortexAnalyzerReportMissingException as missing:
cortex_analyzer = missing.analyzer

if identifier is not None:
result = self.resolve_identifier(
identifier, context, sample)
if result is not None:
return result

if cortex_analyzer is not None:
self.submit_to_cortex(sample, cortex_analyzer)
# submission either raises an exception or has failed, so
# getting here is an error
return self.result(
Result.failed,
_("Evaluation of expression uses undefined "
"identifier."),
_("Evaluation of expression failed to submit Cortex "
"analysis."),
False)

context['variables'][identifier] = value
# beware: here we intentionally loop on through for retry

# our implication returns None if expression did not match

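To round off the rules.py changes: the retry loop in evaluate() relies on CortexReport acting as a proxy that demands an analyzer run whenever a not-yet-available subreport is accessed. A minimal sketch of that mechanism, with hypothetical internals (only the class and exception names appear in the diff above):

    class FileInfoAnalyzer:
        """ Hypothetical stand-in for a concrete Cortex analyzer class. """
        name = 'FileInfo'


    class CortexAnalyzerReportMissingException(Exception):
        """ Raised when a subreport is accessed before its analyzer has run.
        Carries the analyzer whose job needs to be submitted. """
        def __init__(self, analyzer):
            super().__init__("Analyzer report missing: %s" % analyzer.name)
            self.analyzer = analyzer


    class CortexReport:
        """ Proxy collecting the reports of individual Cortex analyzers. """
        def __init__(self):
            self.analyzer_reports = {}

        def register_report(self, analyzer, report):
            """ Attach a finished analyzer report to this proxy. """
            self.analyzer_reports[analyzer.name] = report

        def get_report(self, analyzer):
            """ Return the analyzer's report or demand its submission. """
            report = self.analyzer_reports.get(analyzer.name)
            if report is None:
                # makes the expression rule submit a job for this analyzer
                # and retry evaluation once the result has arrived
                raise CortexAnalyzerReportMissingException(analyzer)
            return report

        @property
        def FileInfoReport(self):
            # hypothetical: each documented attribute maps to one analyzer
            return self.get_report(FileInfoAnalyzer())

With something along these lines in place, an expression touching cortexreport.FileInfoReport would trigger a job submission for that analyzer, and because the CortexReport proxy is registered on the sample (register_cortex_report() above), the finished result can be reused when the sample is requeued and the ruleset is rerun.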