[cli] Local diff workflow support

- Extend the `CodeChecker parse` command with an extra `baseline` output type, which generates a baseline file containing the report hashes of legacy reports.
- Extend the `CodeChecker cmd diff` command to support baseline files.
- Add test cases.
- Extend the documentation with the recommended usage of this workflow.
wandec · Aug 17, 2021 · d162500 · d162500
1 parent 35008b9
commit d162500
Show file tree

Hide file tree

Showing 11 changed files with 625 additions and 145 deletions.
diff --git a/analyzer/codechecker_analyzer/cmd/parse.py b/analyzer/codechecker_analyzer/cmd/parse.py
@@ -26,7 +26,7 @@
 from codechecker_analyzer import analyzer_context, suppress_handler
 
 from codechecker_common import arg, logger, plist_parser, util, cmd_config
-from codechecker_common.output import json as out_json, twodim, \
+from codechecker_common.output import baseline, json as out_json, twodim, \
     codeclimate, gerrit
 from codechecker_common.skiplist_handler import SkipListHandler
 from codechecker_common.source_code_comment_handler import \
@@ -37,7 +37,7 @@
 
 LOG = logger.get_logger('system')
 
-EXPORT_TYPES = ['html', 'json', 'codeclimate', 'gerrit']
+EXPORT_TYPES = ['html', 'json', 'codeclimate', 'gerrit', 'baseline']
 
 _data_files_dir_path = analyzer_context.get_context().data_files_dir_path
 _severity_map_file = os.path.join(_data_files_dir_path, 'config',
@@ -457,7 +457,11 @@ def add_arguments_to_parser(parser):
                                   "For more information see:\n"
                                   "https://github.com/codeclimate/platform/"
                                   "blob/master/spec/analyzers/SPEC.md"
-                                  "#data-types")
+                                  "#data-types\n"
+                                  "'baseline' output can be used to integrate "
+                                  "CodeChecker into your local workflow "
+                                  "without using a CodeChecker server. For "
+                                  "more information see our usage guide.")
 
     output_opts.add_argument('-o', '--output',
                              dest="output_path",
@@ -639,6 +643,9 @@ def _parse_convert_reports(
             report.trim_path_prefixes(trim_path_prefixes)
 
     number_of_reports = len(all_reports)
+    if out_format == "baseline":
+        return (baseline.convert(all_reports), number_of_reports)
+
     if out_format == "codeclimate":
         return (codeclimate.convert(all_reports, severity_map),
                 number_of_reports)
@@ -693,11 +700,6 @@ def _generate_json_output(
         output_text = json.dumps(reports)
 
         if output_path:
-            output_path = os.path.abspath(output_path)
-
-            if not os.path.exists(output_path):
-                os.mkdir(output_path)
-
             output_file_path = os.path.join(output_path, 'reports.json')
             with open(output_file_path, mode='w', encoding='utf-8',
                       errors="ignore") as output_f:
@@ -793,7 +795,20 @@ def main(args):
     if 'output_path' in args:
         output_path = os.path.abspath(args.output_path)
 
+        if not os.path.exists(output_path):
+            os.makedirs(output_path)
+
     if export:
+        if export == 'baseline':
+            report_hashes, number_of_reports = _parse_convert_reports(
+                args.input, export, context.severity_map, trim_path_prefixes,
+                skip_handler)
+
+            if output_path:
+                baseline.write(output_path, report_hashes)
+
+            sys.exit(2 if number_of_reports else 0)
+
         # The HTML part will be handled separately below.
         if export != 'html':
             sys.exit(_generate_json_output(

diff --git a/analyzer/tests/functional/analyze_and_parse/test_analyze_and_parse.py b/analyzer/tests/functional/analyze_and_parse/test_analyze_and_parse.py
@@ -24,6 +24,8 @@
 from libtest import project
 from libtest.codechecker import call_command
 
+from codechecker_common.output import baseline
+
 
 class AnalyzeParseTestCaseMeta(type):
     def __new__(mcs, name, bases, test_dict):
@@ -544,3 +546,47 @@ def test_html_export_exit_code(self):
         out, _, result = call_command(extract_cmd, cwd=self.test_dir,
                                       env=self.env)
         self.assertEqual(result, 0, "Parsing should not found any issue.")
+
+    def test_baseline_output(self):
+        """ Test parse baseline output.
+
+        Runs 'CodeChecker parse -e baseline' twice with the same output
+        directory and checks that the second run extends the baseline file
+        written by the first one instead of replacing it.
+        """
+        output_path = self.test_workspaces['OUTPUT']
+        # The baseline export writes a 'reports.baseline' file into the
+        # output directory.
+        out_file_path = os.path.join(output_path, "reports.baseline")
+
+        # Parse the reports of the first project.
+        test_project_notes = os.path.join(
+            self.test_workspaces['NORMAL'], "test_files", "notes")
+
+        extract_cmd = ['CodeChecker', 'parse',
+                       "-e", "baseline",
+                       "-o", output_path,
+                       test_project_notes,
+                       '--trim-path-prefix', test_project_notes]
+
+        _, _, result = call_command(
+            extract_cmd, cwd=self.test_dir, env=self.env)
+        # Exit code 2 means that the parse command found at least one report.
+        self.assertEqual(result, 2, "Parsing not found any issue.")
+
+        # After the first run the baseline holds exactly one report hash.
+        report_hashes = baseline.get_report_hashes([out_file_path])
+        self.assertEqual(
+            report_hashes, {'3d15184f38c5fa57e479b744fe3f5035'})
+
+        # Parse the reports of the second project into the same output
+        # directory and see whether the baseline file is merged.
+        test_project_macros = os.path.join(
+            self.test_workspaces['NORMAL'], "test_files", "macros")
+
+        extract_cmd = ['CodeChecker', 'parse',
+                       "-e", "baseline",
+                       "-o", output_path,
+                       test_project_macros,
+                       '--trim-path-prefix', test_project_macros]
+
+        _, _, result = call_command(
+            extract_cmd, cwd=self.test_dir, env=self.env)
+        self.assertEqual(result, 2, "Parsing not found any issue.")
+
+        # The hash from the first run must still be there next to the new one.
+        report_hashes = baseline.get_report_hashes([out_file_path])
+        self.assertEqual(
+            report_hashes, {
+                '3d15184f38c5fa57e479b744fe3f5035',
+                'f8fbc46cc5afbb056d92bd3d3d702781'})
diff --git a/codechecker_common/output/baseline.py b/codechecker_common/output/baseline.py
@@ -0,0 +1,76 @@
+# -------------------------------------------------------------------------
+#
+#  Part of the CodeChecker project, under the Apache License v2.0 with
+#  LLVM Exceptions. See LICENSE for license information.
+#  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# -------------------------------------------------------------------------
+""" CodeChecker baseline output helpers. """
+
+from io import TextIOWrapper
+import os
+from typing import Iterable, List, Set
+
+from codechecker_common import logger
+from codechecker_common.report import Report
+
+
+LOG = logger.get_logger('system')
+
+
+def __get_report_hashes(f: TextIOWrapper) -> List[str]:
+    """ Get report hashes from the given file.
+
+    Every line of the file is treated as one report hash. Surrounding
+    whitespace (including the line terminator which is kept when a text file
+    is read line by line) is stripped and blank lines are skipped, so the
+    returned values can be compared directly with report hashes.
+    """
+    return [h for h in (line.strip() for line in f) if h]
+
+
+def get_report_hashes(
+    baseline_file_paths: Iterable[str]
+) -> Set[str]:
+    """ Gather the unique report hashes from the given baseline files. """
+    unique_hashes = set()
+    for baseline_path in baseline_file_paths:
+        with open(baseline_path, mode='r', encoding='utf-8',
+                  errors="ignore") as baseline_file:
+            for report_hash in __get_report_hashes(baseline_file):
+                unique_hashes.add(report_hash)
+
+    return unique_hashes
+
+
+def convert(reports: Iterable[Report]) -> List[str]:
+    """ Turn the given reports into the CodeChecker baseline format.
+
+    The result is the list of distinct report hashes in sorted order.
+    """
+    unique_hashes = {report.report_hash for report in reports}
+    return sorted(unique_hashes)
+
+
+def write(output_dir_path: str, report_hashes: Iterable[str]):
+    """ Create a new baseline file or extend an existing one with the given
+    report hashes in the given output directory. It will remove the duplicates
+    and also sort the report hashes before writing it to a file.
+
+    The file is named 'reports.baseline' and holds one report hash per line.
+    If every given hash is already in the file, the file is left untouched.
+    """
+    file_path = os.path.join(output_dir_path, 'reports.baseline')
+
+    # Materialize the input once: it is needed both for the difference and
+    # for the merge below, and a one-shot iterator (e.g. a generator) would
+    # be empty the second time it is consumed.
+    given_report_hashes = set(report_hashes)
+
+    # 'a+' creates the file if it is missing and keeps its content otherwise.
+    with open(file_path, mode='a+', encoding='utf-8', errors="ignore") as f:
+        # Rewind before reading the already stored hashes.
+        f.seek(0)
+
+        # Drop line terminators and blank entries so that the stored hashes
+        # compare equal to the given ones.
+        old_report_hashes = {
+            h.strip() for h in __get_report_hashes(f) if h.strip()}
+        new_report_hashes = given_report_hashes - old_report_hashes
+
+        if not new_report_hashes:
+            LOG.info("Baseline file (%s) is up-to-date.", file_path)
+            return
+
+        if old_report_hashes:
+            LOG.info("Merging existing baseline file: %s", file_path)
+        else:
+            LOG.info("Creating new baseline file: %s", file_path)
+
+        LOG.info("Total number of old report hashes: %d",
+                 len(old_report_hashes))
+        LOG.info("Total number of new report hashes: %d",
+                 len(new_report_hashes))
+
+        LOG.debug("New report hashes: %s", sorted(new_report_hashes))
+
+        # Replace the whole content with the sorted union of old and new
+        # hashes.
+        f.seek(0)
+        f.truncate()
+        f.write("\n".join(sorted(old_report_hashes | given_report_hashes)))
diff --git a/docs/analyzer/user_guide.md b/docs/analyzer/user_guide.md
@@ -1577,10 +1577,11 @@ Statistics analysis feature arguments:
   </summary>
 
 ```
-Usage: CodeChecker parse [-h] [--config CONFIG_FILE] [-t {plist}]
-                         [-e {html,json,codeclimate,gerrit}] [-o OUTPUT_PATH]
-                         [--suppress SUPPRESS] [--export-source-suppress]
-                         [--print-steps] [-i SKIPFILE]
+usage: CodeChecker parse [-h] [--config CONFIG_FILE] [-t {plist}]
+                         [-e {html,json,codeclimate,gerrit,baseline}]
+                         [-o OUTPUT_PATH] [--suppress SUPPRESS]
+                         [--export-source-suppress] [--print-steps]
+                         [-i SKIPFILE]
                          [--trim-path-prefix [TRIM_PATH_PREFIX [TRIM_PATH_PREFIX ...]]]
                          [--review-status [REVIEW_STATUS [REVIEW_STATUS ...]]]
                          [--verbose {info,debug_analyzer,debug}]
@@ -1643,12 +1644,16 @@ optional arguments:
                         Set verbosity level.
 
 export arguments:
-  -e {html,json,codeclimate,gerrit}, --export {html,json,codeclimate,gerrit}
+  -e {html,json,codeclimate,gerrit,baseline}, --export {html,json,codeclimate,gerrit,baseline}
                         Specify extra output format type.
                         'codeclimate' format can be used for Code Climate and
                         for GitLab integration. For more information see:
                         https://github.com/codeclimate/platform/blob/master/sp
-                        ec/analyzers/SPEC.md#data-types (default: None)
+                        ec/analyzers/SPEC.md#data-types
+                        'baseline' output can be used to integrate CodeChecker
+                        into your local workflow without using a CodeChecker
+                        server. For more information see our usage guide.
+                        (default: None)
   -o OUTPUT_PATH, --output OUTPUT_PATH
                         Store the output in the given folder.
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -41,6 +41,7 @@ It invokes Clang Static Analyzer and Clang-Tidy tools to analyze your code.
       - [Alternative 2: Store each analysis in a new run](#storing-new-runs)
     - [Gerrit Integration](#gerrit-integration)
     - [Setting up user authentication](authentication)
+  - [Step 9: Integrate CodeChecker into your local workflow](#step-9)
   - [Updating CodeChecker to new version](#upgrade)
 - [Unique Report Identifier (RI)](#unique-report-identifier)
   - [Listing and Counting Reports](#listing-reports)
@@ -754,6 +755,24 @@ guide.
 You can set up authentication for your server and (web,command line) clients
 as described in the [Authentication Guide](web/authentication.md).
 
+
+## Step 9: Integrate CodeChecker into your local workflow <a name="step-9"></a>
+If you want to use CodeChecker in your project but you don't want to run a
+CodeChecker server or fix every report found by CodeChecker the first time
+(legacy findings), you can follow these steps:
+1. Analyze your project to a report directory (e.g.: `./reports`). For more
+information see [Step 2](#step-2).
+2. Create a baseline file from the reports which contains the legacy findings:
+`CodeChecker parse ./reports -e baseline -o .`. It is recommended to store
+this baseline file (`reports.baseline`) in your repository.
+3. On source code changes, after your project is re-analyzed, use the
+CodeChecker diff command to get the new reports:
+`CodeChecker cmd diff -b ./reports.baseline -n ./reports --new`
+4. On configuration changes (new checkers / options are enabled / disabled,
+new CodeChecker / clang version is used, etc.) re-generate the baseline file
+(steps 1-2).
+
+
 ## Updating CodeChecker to new version <a name="upgrade"></a>
 If a new CodeChecker release is available it might be possible that there are
 some database changes compared to the previous release. If you run into

diff --git a/docs/web/user_guide.md b/docs/web/user_guide.md
@@ -1092,26 +1092,29 @@ optional arguments:
                         The 'base' (left) side of the difference: these
                         analysis runs are used as the initial state in the
                         comparison. The parameter can be multiple run names
-                        (on the remote server) or multiple local report
-                        directories (result of the analyze command). In case
-                        of run name the the basename can contain * quantifiers
-                        which matches any number of characters (zero or more).
-                        So if you have run-a-1, run-a-2 and run-b-1 then
-                        "run-a*" selects the first two. In case of run names
-                        tag labels can also be used separated by a colon (:)
-                        character: "run_name:tag_name".
-  -n NEW_RUNS [NEW_RUNS ...], --newname NEW_RUNS [NEW_RUNS ...]
-                        The 'new' (right) side of the difference: these
-                        analysis runs are compared to the -b/--basename runs.
-                        The parameter can be multiple run names (on the remote
-                        server) or multiple local report directories (result
-                        of the analyze command). In case of run name the
-                        newname can contain * quantifiers which matches any
+                        (on the remote server), multiple local report
+                        directories (result of the analyze command) or
+                        baseline files (generated by the 'CodeChecker parse -e
+                        baseline' command). In case of run name the the
+                        basename can contain * quantifiers which matches any
                         number of characters (zero or more). So if you have
                         run-a-1, run-a-2 and run-b-1 then "run-a*" selects the
                         first two. In case of run names tag labels can also be
                         used separated by a colon (:) character:
                         "run_name:tag_name".
+  -n NEW_RUNS [NEW_RUNS ...], --newname NEW_RUNS [NEW_RUNS ...]
+                        The 'new' (right) side of the difference: these
+                        analysis runs are compared to the -b/--basename runs.
+                        The parameter can be multiple run names (on the remote
+                        server), multiple local report directories (result of
+                        the analyze command) or baseline files (generated by
+                        the 'CodeChecker parse -e baseline' command). In case
+                        of run name the newname can contain * quantifiers
+                        which matches any number of characters (zero or more).
+                        So if you have run-a-1, run-a-2 and run-b-1 then
+                        "run-a*" selects the first two. In case of run names
+                        tag labels can also be used separated by a colon (:)
+                        character: "run_name:tag_name".
   -o {plaintext,rows,table,csv,json,html,gerrit,codeclimate} [{plaintext,rows,table,csv,json,html,gerrit,codeclimate} ...], --output {plaintext,rows,table,csv,json,html,gerrit,codeclimate} [{plaintext,rows,table,csv,json,html,gerrit,codeclimate} ...]
                         The output format(s) to use in showing the data.
                         - html: multiple html files will be generated in the
@@ -1317,6 +1320,10 @@ exist in the remote run 'run1' but appear in the local report directory:
 Compare two runs and show results that exist in both runs and filter results
 by multiple severity values:
     CodeChecker cmd diff -b run1 -n run2 --unresolved --severity high medium
+
+Compare a baseline file (generated by the 'CodeChecker parse -e baseline'
+command) and a local report directory and show new results:
+    CodeChecker cmd diff -b /reports.baseline -n /my_report_dir --new
 ```
 </details>
 

diff --git a/web/client/codechecker_client/cmd/cmd.py b/web/client/codechecker_client/cmd/cmd.py
@@ -490,16 +490,17 @@ def __register_diff(parser):
                         help="The 'base' (left) side of the difference: these "
                              "analysis runs are used as the initial state in "
                              "the comparison. The parameter can be multiple "
-                             "run names (on the remote server) or multiple "
+                             "run names (on the remote server), multiple "
                              "local report directories (result of the analyze "
-                             "command). In case of run name the the basename "
-                             "can contain * quantifiers which matches any "
-                             "number of characters (zero or more). So if you "
-                             "have run-a-1, run-a-2 and run-b-1 then "
-                             "\"run-a*\" selects the first two. In case of "
-                             "run names tag labels can also be used separated "
-                             "by a colon (:) character: "
-                             "\"run_name:tag_name\".")
+                             "command) or baseline files (generated by the "
+                             "'CodeChecker parse -e baseline' command). In "
+                             "case of run name the the basename can contain * "
+                             "quantifiers which matches any number of "
+                             "characters (zero or more). So if you have "
+                             "run-a-1, run-a-2 and run-b-1 then \"run-a*\" "
+                             "selects the first two. In case of run names tag "
+                             "labels can also be used separated by a colon "
+                             "(:) character: \"run_name:tag_name\".")
 
     parser.add_argument('-n', '--newname',
                         type=str,
@@ -510,11 +511,13 @@ def __register_diff(parser):
                         help="The 'new' (right) side of the difference: these "
                              "analysis runs are compared to the -b/--basename "
                              "runs. The parameter can be multiple run names "
-                             "(on the remote server) or multiple local "
+                             "(on the remote server), multiple local "
                              "report directories (result of the analyze "
-                             "command). In case of run name the newname can "
-                             "contain * quantifiers which matches any number "
-                             "of characters (zero or more). So if you have "
+                             "command) or baseline files (generated by the "
+                             "'CodeChecker parse -e baseline' command). In "
+                             "case of run name the newname can contain * "
+                             "quantifiers which matches any number of "
+                             "characters (zero or more). So if you have "
                              "run-a-1, run-a-2 and run-b-1 then "
                              "\"run-a*\" selects the first two. In case of "
                              "run names tag labels can also be used separated "
@@ -1342,7 +1345,11 @@ def add_arguments_to_parser(parser):
 
 Compare two runs and show results that exist in both runs and filter results
 by multiple severity values:
-    CodeChecker cmd diff -b run1 -n run2 --unresolved --severity high medium'''
+    CodeChecker cmd diff -b run1 -n run2 --unresolved --severity high medium
+
+Compare a baseline file (generated by the 'CodeChecker parse -e baseline'
+command) and a local report directory and show new results:
+    CodeChecker cmd diff -b /reports.baseline -n /my_report_dir --new'''
     )
     __register_diff(diff)