Skip to content

Commit 7ae891f

Browse files
authored
feat: use provenance to find commits for supported PURL types. (#653)
This PR allows for the extraction of repository URLs and related commits from provenance files. Supported provenance formats are SLSA v0.1, v0.2, and v1, as well as Witness v0.1. This feature takes effect when a user supplies a provenance file as input to the analysis, or when one can be retrieved from npm or a configured JFrog repository, as applicable. Signed-off-by: Ben Selwyn-Smith <[email protected]>
1 parent 38cb543 commit 7ae891f

File tree

17 files changed

+1495
-149
lines changed

17 files changed

+1495
-149
lines changed

scripts/dev_scripts/integration_tests.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,15 @@ if [[ -z "$NO_NPM_TEST" ]]; then
9999
$RUN_MACARON analyze -purl pkg:npm/@sigstore/[email protected] -rp https://github.com/sigstore/sigstore-js -b main -d ebdcfdfbdfeb9c9aeee6df53674ef230613629f5 --skip-deps || log_fail
100100

101101
check_or_update_expected_output $COMPARE_JSON_OUT $JSON_RESULT $JSON_EXPECTED || log_fail
102+
103+
echo -e "\n----------------------------------------------------------------------------------"
104+
echo "[email protected]: Extracting repository URL and commit from provenance while Repo Finder is disabled."
105+
echo -e "----------------------------------------------------------------------------------\n"
106+
JSON_EXPECTED=$WORKSPACE/tests/e2e/expected_results/purl/npm/semver/semver.json
107+
JSON_RESULT=$WORKSPACE/output/reports/npm/semver/semver.json
108+
$RUN_MACARON -dp tests/e2e/defaults/disable_repo_finder.ini analyze -purl pkg:npm/[email protected] || log_fail
109+
110+
check_or_update_expected_output $COMPARE_JSON_OUT $JSON_RESULT $JSON_EXPECTED || log_fail
102111
fi
103112

104113
echo -e "\n----------------------------------------------------------------------------------"

src/macaron/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def analyze_slsa_levels_single(analyzer_single_args: argparse.Namespace) -> None
142142
run_config,
143143
analyzer_single_args.sbom_path,
144144
analyzer_single_args.skip_deps,
145-
prov_payload=prov_payload,
145+
provenance_payload=prov_payload,
146146
)
147147
sys.exit(status_code)
148148

src/macaron/errors.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,15 @@ class InvalidHTTPResponseError(MacaronError):
5858

5959
class CheckRegistryError(MacaronError):
    """The error class for the Check Registry."""
61+
62+
63+
class ProvenanceError(MacaronError):
    """Raised when extracting information from a provenance fails."""
65+
66+
67+
class JsonError(MacaronError):
    """Raised when extracting a value from JSON data fails."""
69+
70+
71+
class InvalidAnalysisTargetError(MacaronError):
    """Raised when it is not possible to construct a valid Analysis Target."""

src/macaron/json_tools.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""This module provides utility functions for JSON data."""
5+
6+
from typing import TypeVar
7+
8+
from macaron.errors import JsonError
9+
from macaron.util import JsonType
10+
11+
T = TypeVar("T", bound=JsonType)
12+
13+
14+
def json_extract(entry: JsonType, keys: list[str], type_: type[T]) -> T:
    """Return the value found by following the list of depth-sequential keys inside the passed JSON dictionary.

    The value must be of the passed type.

    Parameters
    ----------
    entry: JsonType
        An entry point into a JSON structure.
    keys: list[str]
        The list of depth-sequential keys within the JSON.
    type_: type[T]
        The type to check the value against and return it as.

    Returns
    -------
    T:
        The found value as the type of the type parameter.

    Raises
    ------
    JsonError
        Raised if an error occurs while searching for or validating the value.
    """
    target = entry

    for index, key in enumerate(keys):
        # The path walked so far; only used to build the error messages.
        parent_path = ".".join(keys[:index])
        if not isinstance(target, dict):
            raise JsonError(f"Expect the value .{parent_path} to be a dict.")
        if key not in target:
            raise JsonError(f"JSON key '{key}' not found in .{parent_path}.")
        target = target[key]

    # In Python bool is a subclass of int, so isinstance(True, int) holds. A JSON boolean is not a JSON
    # number though, so it must not satisfy a request for an int.
    is_bool_for_int = type_ is int and isinstance(target, bool)
    if isinstance(target, type_) and not is_bool_for_int:
        return target

    raise JsonError(f"Expect the value .{'.'.join(keys)} to be of type '{type_}'.")
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""This module contains methods for extracting repository and commit metadata from provenance files."""
5+
import logging
6+
7+
from macaron.errors import JsonError, ProvenanceError
8+
from macaron.json_tools import json_extract
9+
from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV1Payload, InTotoV01Payload
10+
from macaron.util import JsonType
11+
12+
logger: logging.Logger = logging.getLogger(__name__)
13+
14+
15+
# DigestSet keys accepted as carrying a Git commit hash, one list per supported SLSA provenance version.
# Each list is passed as the ``valid_algorithms`` argument of ``_extract_commit_from_digest_set``.
SLSA_V01_DIGEST_SET_GIT_ALGORITHMS = ["sha1"]
16+
SLSA_V02_DIGEST_SET_GIT_ALGORITHMS = ["sha1"]
17+
SLSA_V1_DIGEST_SET_GIT_ALGORITHMS = ["sha1", "gitCommit"]
18+
19+
20+
def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str, str]:
    """Extract the repository URL and commit hash from the passed provenance payload.

    The extractor is selected from the in-toto payload class together with the ``predicateType`` of the statement.

    Parameters
    ----------
    payload: InTotoPayload
        The payload to extract from.

    Returns
    -------
    tuple[str, str]
        The repository URL and the commit hash, both non-empty.

    Raises
    ------
    ProvenanceError
        If the extraction process fails for any reason, including when no extractor matches the payload.
    """
    predicate_type = payload.statement.get("predicateType")
    repo, commit = "", ""

    try:
        if isinstance(payload, InTotoV1Payload):
            if predicate_type == "https://slsa.dev/provenance/v1":
                repo, commit = _extract_from_slsa_v1(payload)
        elif isinstance(payload, InTotoV01Payload):
            # The predicate types are mutually exclusive, so at most one extractor runs.
            if predicate_type == "https://slsa.dev/provenance/v0.2":
                repo, commit = _extract_from_slsa_v02(payload)
            elif predicate_type == "https://slsa.dev/provenance/v0.1":
                repo, commit = _extract_from_slsa_v01(payload)
            elif predicate_type == "https://witness.testifysec.com/attestation-collection/v0.1":
                repo, commit = _extract_from_witness_provenance(payload)
    except JsonError as error:
        logger.debug(error)
        raise ProvenanceError("JSON exception while extracting from provenance.") from error

    if repo and commit:
        logger.debug("Extracted repo and commit from provenance: %s, %s", repo, commit)
        return repo, commit

    # Nothing was extracted: either no extractor matched, or one returned an empty value.
    msg = (
        f"Extraction from provenance not supported for versions: "
        f"predicate_type {predicate_type}, in-toto {str(type(payload))}."
    )
    logger.debug(msg)
    raise ProvenanceError(msg)
66+
67+
68+
def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str, str]:
    """Extract the repository and commit metadata from the slsa v01 provenance payload.

    Parameters
    ----------
    payload: InTotoV01Payload
        The payload to extract from.

    Returns
    -------
    tuple[str, str]
        The repository URL and the commit hash.

    Raises
    ------
    ProvenanceError
        If the predicate is missing, the material index is out of bounds, the indexed material is invalid,
        or the repository URL or commit hash cannot be extracted.
    JsonError
        If an expected key is missing from the predicate or holds a value of an unexpected type.
    """
    predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
    if not predicate:
        raise ProvenanceError("No predicate in payload statement.")

    # The repository URL and commit are stored inside an entry in the list of predicate -> materials.
    # In predicate -> recipe -> definedInMaterial we find the list index that points to the correct entry.
    list_index = json_extract(predicate, ["recipe", "definedInMaterial"], int)
    material_list = json_extract(predicate, ["materials"], list)
    # A negative index would silently wrap around to the end of the list, so it is rejected as well.
    if not 0 <= list_index < len(material_list):
        raise ProvenanceError("Material list index outside of material list bounds.")
    material = material_list[list_index]
    if not material or not isinstance(material, dict):
        raise ProvenanceError("Indexed material list entry is invalid.")

    uri = json_extract(material, ["uri"], str)
    repo = _clean_spdx(uri)
    # Fail here with a precise message rather than returning an empty repository to the caller.
    if not repo:
        raise ProvenanceError("Failed to extract repository URL from provenance.")

    digest_set = json_extract(material, ["digest"], dict)
    commit = _extract_commit_from_digest_set(digest_set, SLSA_V01_DIGEST_SET_GIT_ALGORITHMS)
    if not commit:
        raise ProvenanceError("Failed to extract commit hash from provenance.")

    return repo, commit
95+
96+
97+
def _extract_from_slsa_v02(payload: InTotoV01Payload) -> tuple[str, str]:
    """Extract the repository and commit metadata from the slsa v02 provenance payload.

    Parameters
    ----------
    payload: InTotoV01Payload
        The payload to extract from.

    Returns
    -------
    tuple[str, str]
        The repository URL and the commit hash.

    Raises
    ------
    ProvenanceError
        If the predicate is missing, or the repository URL or commit hash cannot be extracted.
    JsonError
        If an expected key is missing from the predicate or holds a value of an unexpected type.
    """
    predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
    if not predicate:
        raise ProvenanceError("No predicate in payload statement.")

    # The repository URL and commit are stored within the predicate -> invocation -> configSource object.
    # See https://slsa.dev/spec/v0.2/provenance
    uri = json_extract(predicate, ["invocation", "configSource", "uri"], str)
    repo = _clean_spdx(uri)
    # Check the cleaned value: it is empty for an empty URI, and may also be empty if cleaning removed everything.
    if not repo:
        raise ProvenanceError("Failed to extract repository URL from provenance.")

    digest_set = json_extract(predicate, ["invocation", "configSource", "digest"], dict)
    commit = _extract_commit_from_digest_set(digest_set, SLSA_V02_DIGEST_SET_GIT_ALGORITHMS)
    if not commit:
        raise ProvenanceError("Failed to extract commit hash from provenance.")

    return repo, commit
117+
118+
119+
def _extract_from_slsa_v1(payload: InTotoV1Payload) -> tuple[str, str]:
    """Extract the repository and commit metadata from the slsa v1 provenance payload.

    The repository URL is read from the external parameters of the build definition, at a location that
    depends on the build type. The commit is then taken from the first resolved dependency whose cleaned
    URI equals that repository URL.

    Parameters
    ----------
    payload: InTotoV1Payload
        The payload to extract from.

    Returns
    -------
    tuple[str, str]
        The repository URL and the commit hash.

    Raises
    ------
    ProvenanceError
        If the predicate is missing, or the repository URL or commit hash cannot be extracted.
    JsonError
        If an expected key is missing from the predicate or holds a value of an unexpected type.
    """
    predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
    if not predicate:
        raise ProvenanceError("No predicate in payload statement.")

    build_def = json_extract(predicate, ["buildDefinition"], dict)
    build_type = json_extract(build_def, ["buildType"], str)

    # Extract the repository URL.
    repo = ""
    if build_type == "https://slsa-framework.github.io/gcb-buildtypes/triggered-build/v1":
        try:
            repo = json_extract(build_def, ["externalParameters", "sourceToBuild", "repository"], str)
        except JsonError:
            # Fall back to the config source when no source to build is recorded.
            repo = json_extract(build_def, ["externalParameters", "configSource", "repository"], str)
    elif build_type == "https://slsa-framework.github.io/github-actions-buildtypes/workflow/v1":
        repo = json_extract(build_def, ["externalParameters", "workflow", "repository"], str)

    if not repo:
        raise ProvenanceError("Failed to extract repository URL from provenance.")

    # Extract the commit hash.
    commit = ""
    deps = json_extract(build_def, ["resolvedDependencies"], list)
    for dep in deps:
        if not isinstance(dep, dict):
            continue
        # A dependency without a string URI cannot refer to the repository. Skip it instead of letting
        # one unrelated entry abort the whole extraction.
        uri = dep.get("uri")
        if not isinstance(uri, str) or _clean_spdx(uri) != repo:
            continue
        digest_set = json_extract(dep, ["digest"], dict)
        commit = _extract_commit_from_digest_set(digest_set, SLSA_V1_DIGEST_SET_GIT_ALGORITHMS)
        if commit:
            # Stop at the first usable commit so that later entries cannot override or invalidate it.
            break

    if not commit:
        raise ProvenanceError("Failed to extract commit hash from provenance.")

    return repo, commit
158+
159+
160+
def _extract_from_witness_provenance(payload: InTotoV01Payload) -> tuple[str, str]:
    """Extract the repository and commit metadata from the passed witness provenance payload.

    To successfully return the commit and repository URL, the payload must respectively contain a Git attestation, and
    either a GitHub or GitLab attestation.

    Parameters
    ----------
    payload: InTotoV01Payload
        The payload to extract from.

    Returns
    -------
    tuple[str, str]
        The repository URL and the commit hash.

    Raises
    ------
    ProvenanceError
        If the predicate is missing, or the repository URL or the commit hash is not found in the attestations.
    JsonError
        If an expected key is missing from the predicate or holds a value of an unexpected type.
    """
    predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
    if not predicate:
        raise ProvenanceError("No predicate in payload statement.")

    attestations = json_extract(predicate, ["attestations"], list)
    commit = ""
    repo = ""
    for entry in attestations:
        if not isinstance(entry, dict):
            continue
        entry_type = entry.get("type")
        # The type comes from untrusted JSON: anything other than a string cannot match and must not
        # reach the string methods below.
        if not isinstance(entry_type, str):
            continue
        if entry_type.startswith("https://witness.dev/attestations/git/"):
            commit = json_extract(entry, ["attestation", "commithash"], str)
        elif entry_type.startswith(
            ("https://witness.dev/attestations/gitlab/", "https://witness.dev/attestations/github/")
        ):
            repo = json_extract(entry, ["attestation", "projecturl"], str)

    if not commit or not repo:
        raise ProvenanceError("Could not extract repo and commit from provenance.")

    return repo, commit
200+
201+
202+
def _extract_commit_from_digest_set(digest_set: dict[str, JsonType], valid_algorithms: list[str]) -> str:
    """Return the commit hash stored in the passed DigestSet.

    The DigestSet is an in-toto object that maps algorithm types to commit hashes (digests). Entries are
    examined in the order of the DigestSet, and the first string digest whose algorithm is listed in
    ``valid_algorithms`` is returned.

    Raises
    ------
    ProvenanceError
        If no entry has both a valid algorithm and a string digest.
    """
    algorithms = digest_set.keys()
    if len(algorithms) > 1:
        logger.debug("DigestSet contains multiple algorithms: %s", algorithms)

    for algorithm, digest in digest_set.items():
        if algorithm in valid_algorithms and isinstance(digest, str):
            return digest

    raise ProvenanceError(f"No valid digest in digest set: {digest_set.keys()} not in {valid_algorithms}")
216+
217+
218+
def _clean_spdx(uri: str) -> str:
219+
"""Clean the passed SPDX URI and return the normalised URL it represents.
220+
221+
A SPDX URI has the form: git+https://example.com@refs/heads/main
222+
"""
223+
url, _, _ = uri.lstrip("git+").rpartition("@")
224+
return url

0 commit comments

Comments
 (0)