Skip to content

Commit

Permalink
Pypi to Github files integrity (DataDog#114)
Browse files Browse the repository at this point in the history
* save commit

* save commit

* save commit

* v0 of mismatch files detector

* update notes

* update hash algorithm

* fix issue in versioning

* ignore egg_info in setup.cfg

* save commit before mistakes are made

* do not rely on github api

* update version choice

* highlight last issue

* ensure we use a proper one

* add analysis of new detector performances + updates

* remove costly piece of the notebook

* update notebook

* move best candidate choice from 23% failure to 16%

* fix repo mismatch issue

* add a couple tests

* fix code quality

* add instructions for notebook

* fix requirements

* fix tests

* remove run.py

* add libgit2 to the docker

* add libgit2 to the docker

* add libgit2 to the docker

* apply review comments

* Typo: replace 'missmatch' by 'mismatch' in new heuristic

* Rename detector and add docs

* Add description of new heuristic to README

* fix package name issue

* fix issues

Co-authored-by: Christophe Tafani-Dereeper <[email protected]>
  • Loading branch information
vdeturckheim and christophetd authored Jan 10, 2023
1 parent 06d782e commit 2da2ac9
Show file tree
Hide file tree
Showing 26 changed files with 685 additions and 32 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ FROM python:3.10-alpine3.17 AS builder
LABEL org.opencontainers.image.source="https://github.com/DataDog/guarddog/"
RUN mkdir /app
# gcc and musl-dev needed for the pip install
RUN apk add --update gcc musl-dev g++
RUN apk add --update gcc musl-dev g++ libgit2-dev libffi-dev
ADD . /app
WORKDIR /app
RUN pip install --no-cache-dir -r requirements.txt
Expand Down
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,13 @@ GuardDog comes with 2 types of heuristics:

### Package metadata heuristics

| **Heuristic** | **Description** |
|:------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
| Typosquatting | Package has a name close to one of the top 5k PyPI packages |
| Potentially compromised maintainer e-mail domain | Maintainer e-mail address is associated to a domain that was re-registered later than the last package release. This can be an indicator that this is a custom domain that expired, and was leveraged by an attacker to compromise the package owner's PyPI account. See [here](https://therecord.media/thousands-of-npm-accounts-use-email-addresses-with-expired-domains) for a description of the issue for npm. |
| Empty package description | Package has an empty description of PyPI |
| Release 0.0.0 | Package has its latest release set to `0.0.0` or `0.0` |
| **Heuristic** | **Description** |
|:-------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
| Typosquatting | Package has a name close to one of the top 5k PyPI packages |
| Potentially compromised maintainer e-mail domain | Maintainer e-mail address is associated with a domain that was re-registered later than the last package release. This can be an indicator that this is a custom domain that expired, and was leveraged by an attacker to compromise the package owner's PyPI account. See [here](https://therecord.media/thousands-of-npm-accounts-use-email-addresses-with-expired-domains) for a description of the issue for npm. |
| Empty package description | Package has an empty description on PyPI |
| Release 0.0.0 | Package has its latest release set to `0.0.0` or `0.0` |
| Source code discrepancy between repository and release artifact (experimental) | The release artifact (e.g. PyPI package archive) has at least one file that differs from the original GitHub repository. This can indicate that the package release artifacts have been backdoored. |

## Development

Expand Down
10 changes: 6 additions & 4 deletions guarddog/analyzer/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from pathlib import Path
from typing import Optional

from semgrep.semgrep_main import invoke_semgrep # type: ignore

Expand Down Expand Up @@ -57,7 +58,7 @@ def __init__(self, ecosystem=ECOSYSTEM.PYPI) -> None:
".semgrep_logs",
]

def analyze(self, path, info=None, rules=None) -> dict:
def analyze(self, path, info=None, rules=None, name: Optional[str] = None, version: Optional[str] = None) -> dict:
"""
Analyzes a package in the given path
Expand Down Expand Up @@ -92,7 +93,7 @@ def analyze(self, path, info=None, rules=None) -> dict:
else:
raise Exception(f"{rule} is not a valid rule.")

metadata_results = self.analyze_metadata(path, info, metadata_rules)
metadata_results = self.analyze_metadata(path, info, metadata_rules, name, version)
sourcecode_results = self.analyze_sourcecode(path, sourcecode_rules)

# Concatenate dictionaries together
Expand All @@ -102,7 +103,8 @@ def analyze(self, path, info=None, rules=None) -> dict:

return {"issues": issues, "errors": errors, "results": results, "path": path}

def analyze_metadata(self, path: str, info, rules=None) -> dict:
def analyze_metadata(self, path: str, info, rules=None, name: Optional[str] = None,
version: Optional[str] = None) -> dict:
"""
Analyzes the metadata of a given package
Expand All @@ -122,7 +124,7 @@ def analyze_metadata(self, path: str, info, rules=None) -> dict:

for rule in all_rules:
try:
rule_matches, message = self.metadata_detectors[rule].detect(info, path)
rule_matches, message = self.metadata_detectors[rule].detect(info, path, name, version)
if rule_matches:
issues += 1
results[rule] = message
Expand Down
3 changes: 2 additions & 1 deletion guarddog/analyzer/metadata/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ def __init__(self) -> None:

# returns (ruleMatches, message)
@abstractmethod
def detect(self, package_info, path: Optional[str] = None) -> tuple[bool, Optional[str]]:
def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
version: Optional[str] = None) -> tuple[bool, Optional[str]]:
pass # pragma: no cover
4 changes: 3 additions & 1 deletion guarddog/analyzer/metadata/empty_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ class EmptyInfoDetector(Detector):
RULE_NAME = "empty_information"

@abstractmethod
def detect(self, package_info, path: Optional[str] = None) -> tuple[bool, str]:
def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
version: Optional[str] = None) -> tuple[bool, str]:
"""
Uses a package's information from PyPI's JSON API to determine
if the package has an empty description
Expand All @@ -29,5 +30,6 @@ def detect(self, package_info, path: Optional[str] = None) -> tuple[bool, str]:
Returns:
bool: True if package description is empty
@param **kwargs:
"""
pass
3 changes: 2 additions & 1 deletion guarddog/analyzer/metadata/npm/empty_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

class NPMEmptyInfoDetector(EmptyInfoDetector):

def detect(self, package_info, path: Optional[str] = None) -> tuple[bool, str]:
def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
version: Optional[str] = None) -> tuple[bool, str]:
if path is None:
raise TypeError("path must be a string")
package_path = os.path.join(path, "package")
Expand Down
3 changes: 2 additions & 1 deletion guarddog/analyzer/metadata/npm/release_zero.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

class NPMReleaseZeroDetector(ReleaseZeroDetector):

def detect(self, package_info, path: Optional[str] = None) -> tuple[bool, str]:
def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
version: Optional[str] = None) -> tuple[bool, str]:
return package_info["dist-tags"]["latest"] in ["0.0.0", "0.0", "0"],\
ReleaseZeroDetector.MESSAGE_TEMPLATE % package_info["dist-tags"]["latest"]
3 changes: 2 additions & 1 deletion guarddog/analyzer/metadata/npm/typosquatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def _get_top_packages(self) -> list:

return list(map(lambda x: x["project"], top_packages_data))

def detect(self, package_info, path: Optional[str] = None) -> tuple[bool, Optional[str]]:
def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
version: Optional[str] = None) -> tuple[bool, Optional[str]]:
"""
Uses a package's information from PyPI's JSON API to determine if the
package is attempting a typosquatting attack
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def _get_domain_creation_date(self, email_domain) -> tuple[Optional[datetime], b

return creation_dates, True

def detect(self, package_info, path: Optional[str] = None) -> tuple[bool, str]:
def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
version: Optional[str] = None) -> tuple[bool, str]:
"""
Uses a package's information from PyPI's JSON API to determine
if the package's email domain might have been compromised
Expand Down
4 changes: 3 additions & 1 deletion guarddog/analyzer/metadata/pypi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from guarddog.analyzer.metadata.pypi.potentially_compromised_email_domain import \
PypiPotentiallyCompromisedEmailDomainDetector
from guarddog.analyzer.metadata.pypi.release_zero import PypiReleaseZeroDetector
from guarddog.analyzer.metadata.pypi.repository_integrity_mismatch import PypiIntegrityMismatchDetector
from guarddog.analyzer.metadata.pypi.typosquatting import PypiTyposquatDetector

PYPI_METADATA_RULES = {}
Expand All @@ -10,7 +11,8 @@
PypiEmptyInfoDetector,
PypiReleaseZeroDetector,
PypiTyposquatDetector,
PypiPotentiallyCompromisedEmailDomainDetector
PypiPotentiallyCompromisedEmailDomainDetector,
PypiIntegrityMismatchDetector
]

for cls in classes:
Expand Down
3 changes: 2 additions & 1 deletion guarddog/analyzer/metadata/pypi/empty_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@


class PypiEmptyInfoDetector(EmptyInfoDetector):
def detect(self, package_info, path: Optional[str] = None) -> tuple[bool, str]:
def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
version: Optional[str] = None) -> tuple[bool, str]:
return len(package_info["info"]["description"].strip()) == 0, EmptyInfoDetector.MESSAGE_TEMPLATE % "PyPI"
3 changes: 2 additions & 1 deletion guarddog/analyzer/metadata/pypi/release_zero.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

class PypiReleaseZeroDetector(ReleaseZeroDetector):

def detect(self, package_info, path: Optional[str] = None) -> tuple[bool, str]:
def detect(self, package_info, path: Optional[str] = None, name: Optional[str] = None,
version: Optional[str] = None) -> tuple[bool, str]:
return (package_info["info"]["version"] in ["0.0.0", "0.0"],
ReleaseZeroDetector.MESSAGE_TEMPLATE % package_info["info"]["version"])
Loading

0 comments on commit 2da2ac9

Please sign in to comment.