Autogenerate 'list-rules' output and autoinject rules docs in README …

…file (DataDog#133)
H4dr1en · Feb 2, 2023 · 414db84 · 414db84
1 parent 7bccb25
commit 414db84
Show file tree

Hide file tree

Showing 41 changed files with 20,664 additions and 20,479 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -9,4 +9,9 @@ repos:
   - id: typecheck
     name: Type check Python code
     language: system
-    entry: make type-check
+    entry: make type-check
+
+  - id: docs
+    name: Autogenerate rules documentation
+    language: system
+    entry: make docs
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: test test-semgrep-rules test-metadata-rules test-core
+.PHONY: test test-semgrep-rules test-metadata-rules test-core docs
 
 test: test-semgrep-rules test-metadata-rules test-core coverage-report
 
@@ -21,3 +21,6 @@ test-core:
 coverage-report:
 	coverage combine .coverage_metadata .coverage_core
 	coverage report
+
+docs:
+	python scripts/generate-rules-docs.py README.md
diff --git a/README.md b/README.md
@@ -67,30 +67,57 @@ GuardDog comes with 2 types of heuristics:
 
 * [**Package metadata heuristics**](https://github.com/DataDog/guarddog/tree/main/guarddog/analyzer/metadata): Python heuristics running against the package metadata on PyPI.
 
-### Source code heuristics
-
-
-|                                                                         **Heuristic**                                                                         |                                                                            **Description**                                                                            |
-|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
-|                       [Command overwrite](https://github.com/DataDog/guarddog/blob/main/guarddog/analyzer/sourcecode/cmd-overwrite.yml)                       | The `install` command is overwritten in the `setup.py` file, indicating that a system command is automatically run when installing the package through `pip install`. |
-|            [Dynamic execution of base64-encoded data](https://github.com/DataDog/guarddog/blob/main/guarddog/analyzer/sourcecode/exec-base64.yml)             |                                          A base64-encoded string ends up being executed by a function like `exec` or `eval`                                           |
-|            [Download of an executable to disk](https://github.com/DataDog/guarddog/blob/main/guarddog/analyzer/sourcecode/download-executable.yml)            |                                          Data coming from an HTTP response ends up being written to disk and made executable                                          |
-| [Exfiltration of sensitive data to a remote server](https://github.com/DataDog/guarddog/blob/main/guarddog/analyzer/sourcecode/exfiltrate-sensitive-data.yml) |                                            Sensitive data from the environment ends up being sent through an HTTP request                                             |
-|                 [Code execution in `setup.py`](https://github.com/DataDog/guarddog/blob/main/guarddog/analyzer/sourcecode/code-execution.yml)                 |                                                 Code in `setup.py` executes code dynamically or starts a new process                                                  |
-|                    [Unusual domain extension](https://github.com/DataDog/guarddog/blob/main/guarddog/analyzer/sourcecode/shady-links.yml)                     |                                      Usage of a domain name with an extension frequently used by malware (e.g. `.xyz` or `.top`)                                      |
-|        [Dynamic execution of hidden data from an image](https://github.com/DataDog/guarddog/blob/main/guarddog/analyzer/sourcecode/steganography.yml)         |                                           The package uses steganography to extract a payload from an image and execute it                                            |
-|               [Use of a common obfuscation method](https://github.com/DataDog/guarddog/blob/main/guarddog/analyzer/sourcecode/obfuscation.yml)                |                            The package uses an obfuscation method commonly used by malware, such as running `eval` on hexadecimal strings                             |
-|           [Silent execution of a process](https://github.com/DataDog/guarddog/blob/main/guarddog/analyzer/sourcecode/silent-process-execution.yml)            |                                                     The package spawns a subprocess without capturing its output                                                      |
-
-### Package metadata heuristics
-
-|                                  **Heuristic**                                  |                                                                                                                                                                                                   **Description**                                                                                                                                                                                                   |
-|:-------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
-|                                  Typosquatting                                  |                                                                                                                                                                             Package has a name close to one of the top 5k PyPI packages                                                                                                                                                                             |
-|                Potentially compromised maintainer e-mail domain                 | Maintainer e-mail address is associated to a domain that was re-registered later than the last package release. This can be an indicator that this is a custom domain that expired, and was leveraged by an attacker to compromise the package owner's PyPI account. See [here](https://therecord.media/thousands-of-npm-accounts-use-email-addresses-with-expired-domains) for a description of the issue for npm. |
-|                            Empty package description                            |                                                                                                                                                                                      Package has an empty description of PyPI                                                                                                                                                                                       |
-|                                  Release 0.0.0                                  |                                                                                                                                                                               Package has its latest release set to `0.0.0` or `0.0`                                                                                                                                                                                |
-| Source code discrepancy between repository and release artifact  (experimental) |                                                                                                         The release artifact (e.g. PyPI package archive) has at least one file that differs from the original GitHub repository. This can indicate that the package release artifacts have been backdoored                                                                                                          |
+<!-- BEGIN_RULE_LIST -->
+### PyPI
+
+Source code heuristics:
+
+| **Heuristic** | **Description** |
+|:-------------:|:---------------:|
+| shady-links | Identify when a package contains an URL to a domain with a suspicious extension |
+| obfuscation | Identify when a package uses a common obfuscation method often used by malware |
+| exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
+| download-executable | Identify when a package downloads and makes executable a remote binary |
+| exec-base64 | Identify when a package dynamically executes base64-encoded code |
+| silent-process-execution | Identify when a package silently executes an executable |
+| steganography | Identify when a package retrieves hidden data from an image and executes it |
+| code-execution | Identify when an OS command is executed in the setup.py file |
+| cmd-overwrite | Identify when the 'install' command is overwritten in setup.py, indicating a piece of code automatically running when the package is installed |
+
+Metadata heuristics:
+
+| **Heuristic** | **Description** |
+|:-------------:|:---------------:|
+| empty_information | Identify packages with an empty description field |
+| release_zero | Identify packages with an release version that's 0.0 or 0.0.0 |
+| typosquatting | Identify packages that are named closely to an highly popular package |
+| potentially_compromised_email_domain | Identify when a package maintainer e-mail domain (and therefore package manager account) might have been compromised |
+| repository_integrity_mismatch | Identify packages with a linked GitHub repository where the package has extra unexpected files |
+
+
+### npm
+
+Source code heuristics:
+
+| **Heuristic** | **Description** |
+|:-------------:|:---------------:|
+| npm-serialize-environment | Identify when a package serializes 'process.env' to exfiltrate environment variables |
+| npm-silent-process-execution | Identify when a package silently executes an executable |
+| shady-links | Identify when a package contains an URL to a domain with a suspicious extension |
+| npm-exec-base64 | Identify when a package dynamically executes code through 'eval' |
+| npm-install-script | Identify when a package has a pre or post-install script automatically running commands |
+
+Metadata heuristics:
+
+| **Heuristic** | **Description** |
+|:-------------:|:---------------:|
+| empty_information | Identify packages with an empty description field |
+| release_zero | Identify packages with an release version that's 0.0 or 0.0.0 |
+| potentially_compromised_email_domain | Identify when a package maintainer e-mail domain (and therefore package manager account) might have been compromised |
+| typosquatting | Identify packages that are named closely to an highly popular package |
+
+
+<!-- END_RULE_LIST -->
 
 ## Development
 

diff --git a/guarddog/__init__.py b/guarddog/__init__.py
@@ -1,2 +1,2 @@
-from guarddog.scanners.pypi_package_scanner import PypiPackageScanner  # NOQA
 from guarddog.scanners.npm_package_scanner import NPMPackageScanner  # NOQA
+from guarddog.scanners.pypi_package_scanner import PypiPackageScanner  # NOQA
diff --git a/guarddog/analyzer/metadata/__init__.py b/guarddog/analyzer/metadata/__init__.py
@@ -5,7 +5,7 @@
 
 
 def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict[str, Detector]:
-    match(ecosystem):
+    match (ecosystem):
         case ECOSYSTEM.PYPI:
             return PYPI_METADATA_RULES
         case ECOSYSTEM.NPM:

diff --git a/guarddog/analyzer/metadata/detector.py b/guarddog/analyzer/metadata/detector.py
@@ -3,14 +3,20 @@
 
 
 class Detector:
-
     RULE_NAME = ""
 
-    def __init__(self) -> None:
-        pass
+    def __init__(self, name: str, description: str) -> None:
+        self.name = name
+        self.description = description
 
     # returns (ruleMatches, message)
     @abstractmethod
     def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
                version: Optional[str] = None) -> tuple[bool, Optional[str]]:
         pass  # pragma: no cover
+
+    def get_name(self) -> str:
+        return self.name
+
+    def get_description(self) -> str:
+        return self.description
diff --git a/guarddog/analyzer/metadata/empty_information.py b/guarddog/analyzer/metadata/empty_information.py
@@ -1,7 +1,3 @@
-""" Empty Information Detector
-
-Detects if a package contains an empty description
-"""
 from abc import abstractmethod
 from typing import Optional
 
@@ -15,7 +11,12 @@ class EmptyInfoDetector(Detector):
     Such situation might be the marker of a low quality package."""
 
     MESSAGE_TEMPLATE = "This package has an empty description on %s"
-    RULE_NAME = "empty_information"
+
+    def __init__(self):
+        super().__init__(
+            name="empty_information",
+            description="Identify packages with an empty description field"
+        )
 
     @abstractmethod
     def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,

diff --git a/guarddog/analyzer/metadata/npm/__init__.py b/guarddog/analyzer/metadata/npm/__init__.py
@@ -13,6 +13,6 @@
     NPMTyposquatDetector
 ]
 
-for cls in classes:
-    parent = cls.__base__
-    NPM_METADATA_RULES[parent.RULE_NAME] = cls()  # type: ignore
+for detectorClass in classes:
+    detectorInstance = detectorClass()  # type: ignore
+    NPM_METADATA_RULES[detectorInstance.get_name()] = detectorInstance
diff --git a/guarddog/analyzer/metadata/npm/potentially_compromised_email_domain.py b/guarddog/analyzer/metadata/npm/potentially_compromised_email_domain.py
@@ -12,10 +12,8 @@
 
 
 class NPMPotentiallyCompromisedEmailDomainDetector(PotentiallyCompromisedEmailDomainDetector):
-
     def __init__(self):
-        super().__init__()
-        self.ecosystem = "npm"
+        super().__init__("npm")
 
     def get_email_addresses(self, package_info: dict) -> list[str]:
         return list(map(lambda x: x["email"], package_info["maintainers"]))

diff --git a/guarddog/analyzer/metadata/npm/release_zero.py b/guarddog/analyzer/metadata/npm/release_zero.py
@@ -11,5 +11,5 @@ class NPMReleaseZeroDetector(ReleaseZeroDetector):
 
     def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
                version: Optional[str] = None) -> tuple[bool, str]:
-        return package_info["dist-tags"]["latest"] in ["0.0.0", "0.0", "0"],\
+        return package_info["dist-tags"]["latest"] in ["0.0.0", "0.0", "0"], \
             ReleaseZeroDetector.MESSAGE_TEMPLATE % package_info["dist-tags"]["latest"]
diff --git a/guarddog/analyzer/metadata/potentially_compromised_email_domain.py b/guarddog/analyzer/metadata/potentially_compromised_email_domain.py
@@ -1,7 +1,3 @@
-""" Compromised Email Detector
-
-Detects if a maintainer's email domain might have been compromised.
-"""
 from abc import abstractmethod
 from datetime import datetime
 from typing import Optional
@@ -12,13 +8,14 @@
 
 
 class PotentiallyCompromisedEmailDomainDetector(Detector):
-    """This heuristic detects whether the maintainer email address has an outdated domain that anyone could acquire.
-    This could lead to the package being overtaken by malicious actors."""
-    RULE_NAME = "potentially_compromised_email_domain"
-
-    def __init__(self):
-        super().__init__()
-        self.ecosystem = ""
+    # The rule name is the same for every ecosystem; only the ecosystem is provided by the implementing subclasses
+    def __init__(self, ecosystem: str):
+        super().__init__(
+            name="potentially_compromised_email_domain",
+            description="Identify when a package maintainer e-mail domain (and therefore package manager account) "
+                        "might have been compromised",
+        )
+        self.ecosystem = ecosystem
 
     def _get_domain_creation_date(self, email_domain) -> tuple[Optional[datetime], bool]:
         """

diff --git a/guarddog/analyzer/metadata/pypi/__init__.py b/guarddog/analyzer/metadata/pypi/__init__.py
@@ -15,6 +15,6 @@
     PypiIntegrityMismatchDetector
 ]
 
-for cls in classes:
-    parent = cls.__base__
-    PYPI_METADATA_RULES[parent.RULE_NAME] = cls()  # type: ignore
+for detectorClass in classes:
+    detectorInstance = detectorClass()  # type: ignore
+    PYPI_METADATA_RULES[detectorInstance.get_name()] = detectorInstance
diff --git a/guarddog/analyzer/metadata/pypi/potentially_compromised_email_domain.py b/guarddog/analyzer/metadata/pypi/potentially_compromised_email_domain.py
@@ -5,17 +5,16 @@
 
 from datetime import datetime
 from typing import Optional
+
 from dateutil import parser
 from packaging import version
 
 from guarddog.analyzer.metadata.potentially_compromised_email_domain import PotentiallyCompromisedEmailDomainDetector
 
 
 class PypiPotentiallyCompromisedEmailDomainDetector(PotentiallyCompromisedEmailDomainDetector):
-
     def __init__(self):
-        super().__init__()
-        self.ecosystem = "Pypi"
+        super().__init__("pypi")
 
     def get_email_addresses(self, package_info: dict) -> list[str]:
         author_email = package_info["info"]["author_email"]

diff --git a/guarddog/analyzer/metadata/release_zero.py b/guarddog/analyzer/metadata/release_zero.py
@@ -1,13 +1,13 @@
-""" Empty Information Detector
-
-Detects when a package has its latest release version to 0.0.0
-"""
-
 from guarddog.analyzer.metadata.detector import Detector
 
 
 class ReleaseZeroDetector(Detector):
     """This heuristic detects if the latest release of this package is version 0."""
 
     MESSAGE_TEMPLATE = "The package has its latest release version to %s"
-    RULE_NAME = "release_zero"
+
+    def __init__(self):
+        super().__init__(
+            name="release_zero",
+            description="Identify packages with an release version that's 0.0 or 0.0.0"
+        )
diff --git a/guarddog/analyzer/metadata/repository_integrity_mismatch.py b/guarddog/analyzer/metadata/repository_integrity_mismatch.py
@@ -1,7 +1,3 @@
-""" Empty Information Detector
-
-Detects if a package contains an empty description
-"""
 from abc import abstractmethod
 from typing import Optional
 
@@ -12,6 +8,12 @@ class IntegrityMismatch(Detector):
     """This package contains files that have been tampered with between the source repository and the package CDN"""
     RULE_NAME = "repository_integrity_mismatch"
 
+    def __init__(self):
+        super().__init__(
+            name="repository_integrity_mismatch",
+            description="Identify packages with a linked GitHub repository where the package has extra unexpected files"
+        )
+
     @abstractmethod
     def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
                version: Optional[str] = None) -> tuple[bool, str]: